from PIL import Image
img=Image.open('Customer-Churn-1.png')
img
import pandas as pd
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import itertools
import missingno as msno
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
pd.options.display.float_format = '{:.2f}'.format
import warnings
warnings.filterwarnings('ignore')
churn1=pd.read_csv('C:/Users/HP/Downloads/churn_modelling1.csv')
churn1.head()
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
churn2=pd.read_csv('C:/Users/HP/Downloads/churn_modelling2.csv')
churn2.head()
| Row_Number | Has_CrCard | ISACTIVEMEMBER | Estimated Salary | Exited | |
|---|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | NaN | NaN | NaN | NaN | NaN |
# USEING RENAME FUNCTION TO MAKE ALL COLUMNS NAME CONSISTENT
churn2.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
churn2.rename(columns={'Row_Number':'RowNumber','Has_CrCard':'HasCrCard'},inplace = True)
churn2.rename(columns={' Estimated Salary':' estimated salary'},inplace=True)
churn2.columns
Index(['RowNumber', 'HasCrCard', 'ISACTIVEMEMBER', ' estimated salary',
'Exited'],
dtype='object')
churn2.head()
| RowNumber | HasCrCard | ISACTIVEMEMBER | estimated salary | Exited | |
|---|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | NaN | NaN | NaN | NaN | NaN |
churn2.columns=churn2.columns.str.replace(' ','')
churn2.columns
Index(['RowNumber', 'HasCrCard', 'ISACTIVEMEMBER', 'estimatedsalary',
'Exited'],
dtype='object')
churn2_col=['rownumber', 'hascrcard', 'isactivemember', 'estimatedsalary','exited']
churn2.columns=churn2_col
churn2.head(2)
| rownumber | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|
| 0 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
churn1_cols=['rownumber','customerid','surname','creditscore','geography','gender','age','tenure','balance','numofproducts']
churn1.columns=churn1_cols
churn1.columns
Index(['rownumber', 'customerid', 'surname', 'creditscore', 'geography',
'gender', 'age', 'tenure', 'balance', 'numofproducts'],
dtype='object')
churn1.head()
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
churn1=pd.read_csv('C:/Users/HP/Downloads/churn_modelling1.csv',names=churn1_cols,header=0)
churn1.head()
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
churn=pd.merge(churn1,churn2,on='rownumber',how='left')
#TO SHOW THE 5 FIRST ROWS OF THE DATASET
churn.head()
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
churn
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10027 | NaN | NaN | Johnstone | 516.00 | France | Male | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | 10000.00 | 15628319.00 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 1.00 | 0.00 | 38190.78 | 0.00 |
10032 rows × 14 columns
# TO SHOW THE LAST 5 ROWS OF THE DATASET
churn.tail()
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10027 | NaN | NaN | Johnstone | 516.00 | France | Male | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | 10000.00 | 15628319.00 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 1.00 | 0.00 | 38190.78 | 0.00 |
churn.index
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8,
9,
...
10022, 10023, 10024, 10025, 10026, 10027, 10028, 10029, 10030,
10031],
dtype='int64', length=10032)
churn.sample
<bound method NDFrame.sample of rownumber customerid surname creditscore geography gender age \
0 1.00 15634602.00 Hargrave 619.00 France Female 42.00
1 2.00 NaN Hill 608.00 Spain Female 41.00
2 3.00 15619304.00 Onio 502.00 France Female 42.00
3 4.00 NaN Boni 699.00 France Female 39.00
4 5.00 NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ...
10027 NaN NaN Johnstone 516.00 France Male 35.00
10028 NaN NaN Liu 709.00 France Female 36.00
10029 NaN NaN Liu 709.00 France Female 36.00
10030 9999.00 NaN Sabbatini 772.00 Germany Male 42.00
10031 10000.00 15628319.00 Walker 792.00 France Female 28.00
tenure balance numofproducts hascrcard isactivemember \
0 2.00 0.00 1.00 1.00 1.00
1 1.00 83807.86 1.00 0.00 1.00
2 8.00 159660.80 3.00 1.00 0.00
3 1.00 0.00 2.00 0.00 0.00
4 NaN NaN NaN NaN NaN
... ... ... ... ... ...
10027 10.00 57369.61 1.00 NaN NaN
10028 7.00 0.00 1.00 NaN NaN
10029 7.00 0.00 1.00 NaN NaN
10030 3.00 75075.31 2.00 1.00 0.00
10031 4.00 130142.79 1.00 1.00 0.00
estimatedsalary exited
0 101348.88 1.00
1 112542.58 0.00
2 113931.57 1.00
3 93826.63 0.00
4 NaN NaN
... ... ...
10027 NaN NaN
10028 NaN NaN
10029 NaN NaN
10030 92888.52 1.00
10031 38190.78 0.00
[10032 rows x 14 columns]>
churn.sample(frac=0.5)
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2793 | 2787.00 | 15585100.00 | Rioux | 511.00 | Germany | Female | 40.00 | 9.00 | 124401.60 | 1.00 | 1.00 | 0.00 | 198814.24 | 1.00 |
| 752 | 746.00 | 15640059.00 | Smith | 606.00 | France | Male | 40.00 | 5.00 | 0.00 | 2.00 | 1.00 | 1.00 | 70899.27 | 0.00 |
| 3262 | 3256.00 | 15671387.00 | Fetherstonhaugh | 507.00 | France | Female | 29.00 | 4.00 | 89349.47 | 2.00 | 0.00 | 0.00 | 180626.68 | 0.00 |
| 6666 | 6650.00 | 15635277.00 | Coates | 605.00 | Spain | Male | 47.00 | 7.00 | 142643.54 | 1.00 | 1.00 | 0.00 | 189310.27 | 0.00 |
| 4668 | 4662.00 | 15670416.00 | Ferri | 780.00 | France | Female | 43.00 | 0.00 | 0.00 | 1.00 | 0.00 | 1.00 | 15705.27 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7524 | 7508.00 | 15642001.00 | Lorenzen | 576.00 | Germany | Male | 44.00 | 9.00 | 119530.52 | 1.00 | 1.00 | 0.00 | 119056.68 | 1.00 |
| 8007 | 7991.00 | 15643635.00 | Robertson | 664.00 | Spain | Male | 32.00 | 5.00 | 133705.74 | 1.00 | 0.00 | 0.00 | 134455.84 | 0.00 |
| 9916 | 9900.00 | 15811594.00 | Gordon | 660.00 | Spain | Female | 28.00 | 3.00 | 128929.88 | 1.00 | 1.00 | 1.00 | 198069.71 | 0.00 |
| 3519 | 3513.00 | NaN | Boylan | 806.00 | Spain | Male | 18.00 | 3.00 | 0.00 | 2.00 | 1.00 | 1.00 | 86994.54 | 0.00 |
| 7472 | 7456.00 | 15748499.00 | Johnson | 550.00 | Germany | Male | 33.00 | 4.00 | 118400.91 | 1.00 | 0.00 | 1.00 | 13999.64 | 1.00 |
5016 rows × 14 columns
churn.sample(7)
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1970 | 1964.00 | 15771139.00 | Douglas | 578.00 | Germany | Male | 34.00 | 8.00 | 147487.23 | 2.00 | 1.00 | 0.00 | 66680.77 | 0.00 |
| 3998 | 3992.00 | 15601659.00 | Fiorentino | 496.00 | Germany | Female | 59.00 | 7.00 | 91680.10 | 2.00 | 1.00 | 0.00 | 163141.18 | 1.00 |
| 4513 | 4507.00 | 15635177.00 | Williamson | 597.00 | Spain | Female | 66.00 | 3.00 | 0.00 | 1.00 | 1.00 | 1.00 | 70532.53 | 0.00 |
| 8960 | 8944.00 | 15727350.00 | Pai | 516.00 | France | Female | 37.00 | 8.00 | 113143.12 | 1.00 | 0.00 | 0.00 | 3363.36 | 0.00 |
| 9790 | 9774.00 | 15698462.00 | Chiu | 532.00 | France | Male | 36.00 | 4.00 | 0.00 | 2.00 | 1.00 | 1.00 | 132798.78 | 0.00 |
| 3666 | 3660.00 | 15664668.00 | Zarate | 534.00 | France | Female | 42.00 | 9.00 | 144801.97 | 1.00 | 0.00 | 1.00 | 12483.39 | 1.00 |
| 1860 | 1854.00 | 15633574.00 | Montes | 730.00 | France | Female | 41.00 | 4.00 | 167545.32 | 1.00 | 1.00 | 0.00 | 128246.81 | 0.00 |
churn.sample(7,random_state=0)
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7109 | 7093.00 | 15723884.00 | Nekrasova | 758.00 | Spain | Male | 40.00 | 3.00 | 0.00 | 2.00 | 0.00 | 0.00 | 96097.65 | 0.00 |
| 3847 | 3841.00 | 15662884.00 | Naylor | 739.00 | Germany | Male | 58.00 | 1.00 | 110597.76 | 1.00 | 0.00 | 1.00 | 160122.66 | 1.00 |
| 5957 | 5941.00 | 15791958.00 | Mazzi | 849.00 | France | Female | 41.00 | 6.00 | 0.00 | 2.00 | 1.00 | 1.00 | 169203.51 | 1.00 |
| 6787 | 6771.00 | 15815295.00 | John | 662.00 | France | Female | 38.00 | 2.00 | 96479.81 | 1.00 | 1.00 | 0.00 | 120259.41 | 0.00 |
| 2849 | 2843.00 | 15785782.00 | Ugonna | 513.00 | Spain | Male | 48.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 114709.13 | 1.00 |
| 3523 | 3517.00 | 15787151.00 | Liao | 638.00 | France | Female | 34.00 | 7.00 | 0.00 | 2.00 | 1.00 | 1.00 | 198969.78 | 0.00 |
| 6287 | 6271.00 | 15702806.00 | Martin | 696.00 | Spain | Male | 24.00 | 9.00 | 0.00 | 1.00 | 0.00 | 0.00 | 10883.52 | 0.00 |
churn
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 15619304.00 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10027 | NaN | NaN | Johnstone | 516.00 | France | Male | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | 10000.00 | 15628319.00 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 1.00 | 0.00 | 38190.78 | 0.00 |
10032 rows × 14 columns
#LETS VISUALIZE THE DISTRIBUTION OF ANY COLUMN JUST TO SEE HOW THE COLUMNS ARE SPREAD
churn['creditscore'].sample(frac=0.001).plot.bar()
<AxesSubplot:>
churn.columns
Index(['creditscore', 'age', 'tenure', 'balance', 'numofproducts',
'isactivemember', 'estimatedsalary', 'exited', 'geography1',
'geography_Germany', 'geography_Spain', 'gender_Male'],
dtype='object')
list(churn.columns)
['creditscore', 'age', 'tenure', 'balance', 'numofproducts', 'isactivemember', 'estimatedsalary', 'exited', 'geography1', 'geography_Germany', 'geography_Spain', 'gender_Male']
churn.size
119676
churn.shape
(9973, 12)
print('number of columns',churn.shape[1])
number of columns 12
print('number of rows',churn.shape[0])
number of rows 9973
churn.dtypes
rownumber float64 customerid float64 surname object creditscore float64 geography object gender object age float64 tenure float64 balance float64 numofproducts float64 hascrcard float64 isactivemember float64 estimatedsalary float64 exited float64 dtype: object
churn_catg=churn.select_dtypes('object')
churn['geography'].value_counts().plot.bar()
<AxesSubplot:>
churn_cont=churn.select_dtypes('number')
churn_catg
| surname | geography | gender | |
|---|---|---|---|
| 0 | Hargrave | France | Female |
| 1 | Hill | Spain | Female |
| 2 | Onio | France | Female |
| 3 | Boni | France | Female |
| 4 | NaN | NaN | NaN |
| ... | ... | ... | ... |
| 10027 | Johnstone | France | Male |
| 10028 | Liu | France | Female |
| 10029 | Liu | France | Female |
| 10030 | Sabbatini | Germany | Male |
| 10031 | Walker | France | Female |
10032 rows × 3 columns
churn_cont
| rownumber | customerid | creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 15619304.00 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | NaN | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10027 | NaN | NaN | 516.00 | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | 709.00 | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | 709.00 | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | 772.00 | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | 10000.00 | 15628319.00 | 792.00 | 28.00 | 4.00 | 130142.79 | 1.00 | 1.00 | 0.00 | 38190.78 | 0.00 |
10032 rows × 11 columns
#UNIQUE VALUES SHOWS THE TYPE OF VALUE IN EACH COLUMN
churn['estimatedsalary'].nunique()
9970
churn['estimatedsalary'].unique
<bound method Series.unique of 0 101348.88
1 112542.58
2 113931.57
3 93826.63
5 149756.71
...
9997 99595.67
10006 96833.00
10025 96270.64
10030 92888.52
10031 38190.78
Name: estimatedsalary, Length: 9973, dtype: float64>
churn['estimatedsalary'].unique()[:20]
array([101348.88, 112542.58, 113931.57, 93826.63, nan, 149756.71,
119346.88, 74940.5 , 71725.73, 80181.12, 76390.01, 26260.98,
190857.79, 65951.65, 64327.26, 5097.67, 14406.41, 158684.81,
54724.03, 170886.17])
churn.duplicated()
0 False
1 False
2 False
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Length: 9973, dtype: bool
churn.duplicated().any()
True
churn_dup=churn.duplicated().any()
print(churn_dup)
True
churn[churn.duplicated()]
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 15 | 14.00 | 15691483.00 | Chin | 549.00 | France | Female | 25.00 | 5.00 | 0.00 | 2.00 | 0.00 | 0.00 | 190857.79 | 0.00 |
| 16 | 14.00 | 15691483.00 | Chin | 549.00 | France | Female | 25.00 | 5.00 | 0.00 | 2.00 | 0.00 | 0.00 | 190857.79 | 0.00 |
| 17 | 14.00 | 15691483.00 | Chin | 549.00 | France | Female | 25.00 | 5.00 | 0.00 | 2.00 | 0.00 | 0.00 | 190857.79 | 0.00 |
| 19 | 15.00 | 15600882.00 | Scott | 635.00 | Spain | Female | 35.00 | 7.00 | 0.00 | 2.00 | 1.00 | 1.00 | 65951.65 | 0.00 |
| 20 | 15.00 | 15600882.00 | Scott | 635.00 | Spain | Female | 35.00 | 7.00 | 0.00 | 2.00 | 1.00 | 1.00 | 65951.65 | 0.00 |
| 21 | 15.00 | 15600882.00 | Scott | 635.00 | Spain | Female | 35.00 | 7.00 | 0.00 | 2.00 | 1.00 | 1.00 | 65951.65 | 0.00 |
| 5278 | NaN | 15607230.00 | Michel | 588.00 | Germany | Male | 33.00 | 9.00 | 150186.22 | 2.00 | NaN | NaN | NaN | NaN |
| 5280 | NaN | 15567630.00 | Bruce | 721.00 | Germany | Male | 40.00 | 6.00 | 100275.88 | 1.00 | NaN | NaN | NaN | NaN |
| 5282 | NaN | 15587507.00 | Feng | 850.00 | France | Male | 47.00 | 6.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 5284 | NaN | 15733904.00 | McDonald | 529.00 | France | Male | 32.00 | 9.00 | 147493.89 | 1.00 | NaN | NaN | NaN | NaN |
| 5286 | NaN | NaN | Watt | 622.00 | France | Male | 43.00 | 8.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 5288 | NaN | NaN | Goodwin | 683.00 | France | Female | 42.00 | 8.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 5290 | NaN | NaN | Arcuri | 575.00 | France | Male | 29.00 | 4.00 | 121823.40 | 2.00 | NaN | NaN | NaN | NaN |
| 5292 | NaN | NaN | Rapuluolisa | 577.00 | Spain | Female | 40.00 | 1.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 5294 | NaN | NaN | Kao | 510.00 | Germany | Male | 31.00 | 0.00 | 113688.63 | 1.00 | NaN | NaN | NaN | NaN |
| 5296 | NaN | NaN | Tsui | 652.00 | Spain | Female | 40.00 | 7.00 | 100471.34 | 1.00 | NaN | NaN | NaN | NaN |
| 9999 | NaN | NaN | Burbidge | 498.00 | Germany | Male | 42.00 | 3.00 | 152039.70 | 1.00 | NaN | NaN | NaN | NaN |
| 10001 | NaN | NaN | Griffin | 655.00 | Germany | Female | 46.00 | 7.00 | 137145.12 | 1.00 | NaN | NaN | NaN | NaN |
| 10003 | NaN | NaN | Cocci | 613.00 | France | Male | 40.00 | 4.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10005 | NaN | 15696175.00 | Echezonachukwu | 602.00 | Germany | Male | 35.00 | 7.00 | 90602.42 | 2.00 | NaN | NaN | NaN | NaN |
| 10008 | NaN | NaN | Bartlett | 673.00 | Germany | Male | 47.00 | 1.00 | 183579.54 | 2.00 | NaN | NaN | NaN | NaN |
| 10010 | NaN | NaN | Mancini | 606.00 | Spain | Male | 30.00 | 8.00 | 180307.73 | 2.00 | NaN | NaN | NaN | NaN |
| 10012 | NaN | NaN | Pirozzi | 775.00 | France | Male | 30.00 | 4.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 10014 | NaN | NaN | McMillan | 841.00 | Spain | Male | 28.00 | 4.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 10016 | NaN | 15798964.00 | Nkemakonam | 714.00 | Germany | Male | 33.00 | 3.00 | 35016.60 | 1.00 | NaN | NaN | NaN | NaN |
| 10018 | NaN | NaN | Ajuluchukwu | 597.00 | France | Female | 53.00 | 4.00 | 88381.21 | 1.00 | NaN | NaN | NaN | NaN |
| 10020 | NaN | NaN | Chukwualuka | 726.00 | Spain | Male | 36.00 | 2.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10022 | NaN | NaN | Rahman | 644.00 | France | Male | 28.00 | 7.00 | 155060.41 | 1.00 | NaN | NaN | NaN | NaN |
| 10024 | NaN | NaN | Wood | 800.00 | France | Female | 29.00 | 2.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 10027 | NaN | NaN | Johnstone | 516.00 | France | Male | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
churn.drop_duplicates(inplace=True)
churn_dup=churn.duplicated().any()
print(churn_dup)
False
churn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 0 to 10031 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rownumber 9974 non-null float64 1 customerid 8515 non-null float64 2 surname 9998 non-null object 3 creditscore 9998 non-null float64 4 geography 9998 non-null object 5 gender 9998 non-null object 6 age 9998 non-null float64 7 tenure 9998 non-null float64 8 balance 9998 non-null float64 9 numofproducts 9998 non-null float64 10 hascrcard 9973 non-null float64 11 isactivemember 9973 non-null float64 12 estimatedsalary 9973 non-null float64 13 exited 9973 non-null float64 dtypes: float64(11), object(3) memory usage: 1.1+ MB
churn.count()
rownumber 9974 customerid 8515 surname 9998 creditscore 9998 geography 9998 gender 9998 age 9998 tenure 9998 balance 9998 numofproducts 9998 hascrcard 9973 isactivemember 9973 estimatedsalary 9973 exited 9973 dtype: int64
churn.isnull().sum()
rownumber 26 customerid 1485 surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 hascrcard 27 isactivemember 27 estimatedsalary 27 exited 27 dtype: int64
churn.isnull().sum().plot.bar()
<AxesSubplot:>
churn.isnull().sum().sum()
1635
churn_missing=churn[churn.isnull().any(1)]
churn_missing
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2.00 | NaN | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 3 | 4.00 | NaN | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 6.00 | NaN | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 1.00 | 0.00 | 149756.71 | 1.00 |
| 6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10023 | NaN | NaN | Wood | 800.00 | France | Female | 29.00 | 2.00 | 0.00 | 2.00 | NaN | NaN | NaN | NaN |
| 10025 | 9996.00 | NaN | Obijiaku | 771.00 | France | Male | 39.00 | 5.00 | 0.00 | 2.00 | 1.00 | 0.00 | 96270.64 | 0.00 |
| 10026 | NaN | NaN | Johnstone | 516.00 | France | Male | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | Liu | 709.00 | France | Female | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
1491 rows × 14 columns
churn.notnull()
| rownumber | customerid | surname | creditscore | geography | gender | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 1 | True | False | True | True | True | True | True | True | True | True | True | True | True | True |
| 2 | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
| 3 | True | False | True | True | True | True | True | True | True | True | True | True | True | True |
| 4 | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10025 | True | False | True | True | True | True | True | True | True | True | True | True | True | True |
| 10026 | False | False | True | True | True | True | True | True | True | True | False | False | False | False |
| 10028 | False | False | True | True | True | True | True | True | True | True | False | False | False | False |
| 10030 | True | False | True | True | True | True | True | True | True | True | True | True | True | True |
| 10031 | True | True | True | True | True | True | True | True | True | True | True | True | True | True |
10000 rows × 14 columns
churn_catg.isnull().sum()
surname 3 geography 3 gender 3 dtype: int64
churn_catg.isnull().sum().plot.bar()
<AxesSubplot:>
churn_cont.isnull().sum()
rownumber 52 customerid 1505 creditscore 3 age 3 tenure 3 balance 3 numofproducts 3 hascrcard 53 isactivemember 53 estimatedsalary 53 exited 53 dtype: int64
churn_cont.isnull().sum().plot.bar()
<AxesSubplot:>
(len(churn))
10000
(len(churn))*100
1000000
churn_percentage=churn.isnull().sum()/(len(churn))*100
churn_percentage
rownumber 0.26 customerid 14.85 surname 0.02 creditscore 0.02 geography 0.02 gender 0.02 age 0.02 tenure 0.02 balance 0.02 numofproducts 0.02 hascrcard 0.27 isactivemember 0.27 estimatedsalary 0.27 exited 0.27 dtype: float64
(churn_percentage)*round(4)
rownumber 1.04 customerid 59.40 surname 0.08 creditscore 0.08 geography 0.08 gender 0.08 age 0.08 tenure 0.08 balance 0.08 numofproducts 0.08 hascrcard 1.08 isactivemember 1.08 estimatedsalary 1.08 exited 1.08 dtype: float64
import missingno as msn
msn.matrix(churn)
<AxesSubplot:>
msn.bar(churn)
<AxesSubplot:>
msn.dendrogram(churn)
<AxesSubplot:>
msn.heatmap(churn)
<AxesSubplot:>
churn_nullvalue = pd.DataFrame((churn.isnull().sum())*100/churn.shape[0]).reset_index()
churn_nullvalue.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(18,6))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=churn_nullvalue,color='green')
plt.xticks(rotation =90,fontsize =8)
ax.axhline(40, ls='--',color='red')
plt.title("Percentage of Missing values in application data")
plt.ylabel("Null Values PERCENTAGE")
plt.xlabel("COLUMNS")
plt.show()
sns.heatmap(churn.isnull())
<AxesSubplot:>
churn.isnull().sum()
rownumber 26 customerid 1485 surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 hascrcard 27 isactivemember 27 estimatedsalary 27 exited 27 dtype: int64
churn=churn.drop(labels=['customerid'],axis=1)
churn.isnull().sum()
rownumber 26 surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 hascrcard 27 isactivemember 27 estimatedsalary 27 exited 27 dtype: int64
churn=churn.drop(labels=['rownumber','hascrcard'],axis=1)
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 27 exited 27 dtype: int64
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
len(churn.columns)
11
churn_cont.isnull().sum().plot.bar()
<AxesSubplot:>
churn_cont
| rownumber | customerid | creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 3.00 | 15619304.00 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 1.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 4.00 | NaN | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 0.00 | 93826.63 | 0.00 |
| 4 | 5.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10027 | NaN | NaN | 516.00 | 35.00 | 10.00 | 57369.61 | 1.00 | NaN | NaN | NaN | NaN |
| 10028 | NaN | NaN | 709.00 | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10029 | NaN | NaN | 709.00 | 36.00 | 7.00 | 0.00 | 1.00 | NaN | NaN | NaN | NaN |
| 10030 | 9999.00 | NaN | 772.00 | 42.00 | 3.00 | 75075.31 | 2.00 | 1.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | 10000.00 | 15628319.00 | 792.00 | 28.00 | 4.00 | 130142.79 | 1.00 | 1.00 | 0.00 | 38190.78 | 0.00 |
10032 rows × 11 columns
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 27 exited 27 dtype: int64
churn.isnull().sum().sum()
97
mean_churn1=churn['estimatedsalary'].mean()
mean_churn1
100093.52722149782
churn['estimatedsalary'].fillna(mean_churn1,inplace=True)
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 0 exited 27 dtype: int64
churn.isnull().sum().sum()
70
churn['isactivemember'].unique()
array([ 1., 0., nan])
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 0 exited 27 dtype: int64
churn.isnull().sum().sum()
70
churn_cont.head(2)
| rownumber | customerid | creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
#drop all categorical or object or string missing values
churn.loc[:,['surname','geography',]].dropna()
| surname | geography | |
|---|---|---|
| 0 | Hargrave | France |
| 1 | Hill | Spain |
| 2 | Onio | France |
| 3 | Boni | France |
| 5 | Chu | Spain |
| ... | ... | ... |
| 10025 | Obijiaku | France |
| 10026 | Johnstone | France |
| 10028 | Liu | France |
| 10030 | Sabbatini | Germany |
| 10031 | Walker | France |
9998 rows × 2 columns
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 2 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 0 exited 27 dtype: int64
churn.gender.unique()
array(['Female', nan, 'Male'], dtype=object)
len(churn.gender.unique())
3
churn_mod=churn.gender.value_counts()
churn_mod
Male 5456 Female 4542 Name: gender, dtype: int64
#mode is male
churn['gender'].fillna('Male',inplace=True)
churn['gender'].fillna('churn_mod',inplace=True)
churn.isnull().sum()
surname 2 creditscore 2 geography 2 gender 0 age 2 tenure 2 balance 2 numofproducts 2 isactivemember 27 estimatedsalary 0 exited 27 dtype: int64
churn.isnull().sum().sum()
68
churn.dropna(inplace=True)
churn.isnull().sum()
surname 0 creditscore 0 geography 0 gender 0 age 0 tenure 0 balance 0 numofproducts 0 isactivemember 0 estimatedsalary 0 exited 0 dtype: int64
churn.isnull().sum().sum()
0
#check new visuals
msn.matrix(churn)
<AxesSubplot:>
msn.bar(churn)
<AxesSubplot:>
msn.dendrogram(churn)
<AxesSubplot:>
msn.heatmap(churn)
<AxesSubplot:>
churn_nullvalue = pd.DataFrame((churn.isnull().sum())*100/churn.shape[0]).reset_index()
churn_nullvalue.columns = ['Column Name', 'Null Values Percentage']
fig = plt.figure(figsize=(18,6))
ax = sns.pointplot(x="Column Name",y="Null Values Percentage",data=churn_nullvalue,color='green')
plt.xticks(rotation =90,fontsize =8)
ax.axhline(40, ls='--',color='red')
plt.title("Percentage of Missing values in application data")
plt.ylabel("Null Values PERCENTAGE")
plt.xlabel("COLUMNS")
plt.show()
sns.heatmap(churn.isnull())
<AxesSubplot:>
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.describe()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|
| count | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 |
| mean | 650.49 | 38.93 | 5.01 | 76498.54 | 1.53 | 0.52 | 100093.53 | 0.20 |
| std | 96.63 | 10.50 | 2.89 | 62383.80 | 0.58 | 0.50 | 57494.40 | 0.40 |
| min | 350.00 | 18.00 | 0.00 | 0.00 | 1.00 | 0.00 | 11.58 | 0.00 |
| 25% | 584.00 | 32.00 | 3.00 | 0.00 | 1.00 | 0.00 | 51099.56 | 0.00 |
| 50% | 652.00 | 37.00 | 5.00 | 97234.58 | 1.00 | 1.00 | 100187.43 | 0.00 |
| 75% | 717.00 | 44.00 | 7.00 | 127642.44 | 2.00 | 1.00 | 149381.32 | 0.00 |
| max | 850.00 | 92.00 | 10.00 | 250898.09 | 4.00 | 1.00 | 199992.48 | 1.00 |
sns.distplot(churn['balance'],kde=False,hist=True,bins=12)
plt.title('balance distribution',size=16)
plt.ylabel('count')
Text(0, 0.5, 'count')
churn.boxplot()
<AxesSubplot:>
sns.distplot(churn['balance'])
<AxesSubplot:xlabel='balance', ylabel='Density'>
churn['estimatedsalary'].plot(kind='box',figsize=(3,4),patch_artist=True)
<AxesSubplot:>
sns.boxplot(x='gender',y='age',data=churn)
<AxesSubplot:xlabel='gender', ylabel='age'>
churn['balance'].plot(kind='box',figsize=(3,4),patch_artist=True)
<AxesSubplot:>
churn['age'].unique()
array([42., 41., 39., 44., 29., 27., 31., 24., 34., 25., 35., 45., 58.,
32., 38., 46., 36., 43., 33., 40., 51., 61., 49., 37., 19., 66.,
56., 26., 21., 55., 75., 22., 30., 28., 65., 48., 52., 50., 57.,
73., 47., 54., 72., 20., 67., 79., 62., 53., 80., 59., 68., 23.,
60., 70., 63., 64., 18., 82., 69., 74., 71., 76., 77., 88., 85.,
84., 78., 81., 92., 83.])
sns.boxplot(y='age',data=churn)
<AxesSubplot:ylabel='age'>
churn['age'].mean()
38.9250977639627
churn1=churn[churn['age']<=70]
sns.boxplot(y='age',data=churn1)
<AxesSubplot:ylabel='age'>
churn2=churn[churn['age']<=58]
sns.boxplot(y='age',data=churn2)
<AxesSubplot:ylabel='age'>
sns.boxplot(y='balance',data=churn)
<AxesSubplot:ylabel='balance'>
churn2=churn
churn['balance'].mean()
76498.54009425409
churn.shape
(9973, 11)
churn.boxplot()
<AxesSubplot:>
churn.hist()
array([[<AxesSubplot:title={'center':'creditscore'}>,
<AxesSubplot:title={'center':'age'}>,
<AxesSubplot:title={'center':'tenure'}>],
[<AxesSubplot:title={'center':'balance'}>,
<AxesSubplot:title={'center':'numofproducts'}>,
<AxesSubplot:title={'center':'isactivemember'}>],
[<AxesSubplot:title={'center':'estimatedsalary'}>,
<AxesSubplot:title={'center':'exited'}>, <AxesSubplot:>]],
dtype=object)
churn.describe()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|
| count | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 |
| mean | 650.49 | 38.93 | 5.01 | 76498.54 | 1.53 | 0.52 | 100093.53 | 0.20 |
| std | 96.63 | 10.50 | 2.89 | 62383.80 | 0.58 | 0.50 | 57494.40 | 0.40 |
| min | 350.00 | 18.00 | 0.00 | 0.00 | 1.00 | 0.00 | 11.58 | 0.00 |
| 25% | 584.00 | 32.00 | 3.00 | 0.00 | 1.00 | 0.00 | 51099.56 | 0.00 |
| 50% | 652.00 | 37.00 | 5.00 | 97234.58 | 1.00 | 1.00 | 100187.43 | 0.00 |
| 75% | 717.00 | 44.00 | 7.00 | 127642.44 | 2.00 | 1.00 | 149381.32 | 0.00 |
| max | 850.00 | 92.00 | 10.00 | 250898.09 | 4.00 | 1.00 | 199992.48 | 1.00 |
churn.shape
(9973, 11)
#DEFINE A FUNCTION CALLED plot_box
#and there will be two arguments call it df and ft
#and plot the boxplot with the boxplot function in pandas
def plot_boxplot(churn1,ft):
churn.boxplot(column=[ft])
plt.grid(False)
plt.show()
#lets call it
plot_boxplot(churn,'creditscore')
sns.boxplot(y='age',data=churn1)
<AxesSubplot:ylabel='age'>
sns.boxplot(y='creditscore',data=churn1)
<AxesSubplot:ylabel='creditscore'>
plot_boxplot(churn,'tenure')
plot_boxplot(churn,'balance')
plot_boxplot(churn,'numofproducts')
churn1=churn[churn['numofproducts']<=3.5]
plot_boxplot(churn1,'numofproducts')
plot_boxplot(churn,'estimatedsalary')
#FIRST WE WILL EXTRAT ALL OUTLIERS IN ALL COLUMN
# SECONDLY WE USE THE INDEXES OF THE OUTLIERS TO REMOVE THEM FROM THE DATAFRAME BY CREATING A LIST
#DEFINE A FUNCTION CALLED OULIERS
# IQR =Q3 - Q1
# +/- 1.5 * IQR
# SET THE UPPER AND LOWER BOUNDRIES
Q1=churn.creditscore.quantile(0.25)
Q3=churn.creditscore.quantile(0.75)
Q1,Q3
(584.0, 717.0)
plt.hist(churn['creditscore'],)
plt.title('distribution of total creditscore')
plt.show()
sns.distplot(churn['creditscore'])
<AxesSubplot:xlabel='creditscore', ylabel='Density'>
IQR=Q3-Q1
IQR
133.0
lower_limit = Q1-(1.5*IQR)
lower_limit
384.5
upper_limit=Q3+(1.5*IQR)
upper_limit
916.5
lower_limit, upper_limit
(384.5, 916.5)
churn[(churn.creditscore<lower_limit)|(churn.creditscore>upper_limit)]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | Obinna | 376.00 | Germany | Female | 29.00 | 4.00 | 115046.74 | 4.00 | 0.00 | 119346.88 | 1.00 |
| 949 | Lin | 376.00 | France | Female | 46.00 | 6.00 | 0.00 | 1.00 | 0.00 | 157333.69 | 1.00 |
| 1200 | Thomas | 363.00 | Spain | Female | 28.00 | 6.00 | 146098.43 | 3.00 | 0.00 | 100615.14 | 1.00 |
| 1412 | Panicucci | 359.00 | France | Female | 44.00 | 6.00 | 128747.69 | 1.00 | 0.00 | 146955.71 | 1.00 |
| 1638 | Azubuike | 350.00 | Spain | Male | 54.00 | 1.00 | 152677.48 | 1.00 | 1.00 | 191973.49 | 1.00 |
| 1845 | Campbell | 350.00 | Germany | Male | 39.00 | 0.00 | 109733.20 | 2.00 | 0.00 | 123602.11 | 1.00 |
| 1969 | Aikenhead | 358.00 | Spain | Female | 52.00 | 8.00 | 143542.36 | 3.00 | 0.00 | 141959.11 | 1.00 |
| 2480 | Chou | 351.00 | Germany | Female | 57.00 | 4.00 | 163146.46 | 1.00 | 0.00 | 169621.69 | 1.00 |
| 2586 | Ozoemena | 365.00 | Germany | Male | 30.00 | 0.00 | 127760.07 | 1.00 | 0.00 | 81537.85 | 1.00 |
| 5511 | Ijendu | 383.00 | Spain | Female | 48.00 | 8.00 | 95808.19 | 1.00 | 0.00 | 137702.01 | 1.00 |
| 8171 | Ch'ien | 367.00 | Spain | Male | 42.00 | 6.00 | 93608.28 | 1.00 | 0.00 | 168816.73 | 1.00 |
| 8740 | Onyekachi | 350.00 | France | Male | 51.00 | 10.00 | 0.00 | 1.00 | 1.00 | 125823.79 | 1.00 |
| 8779 | Lin | 350.00 | France | Female | 60.00 | 3.00 | 0.00 | 1.00 | 0.00 | 113796.15 | 1.00 |
| 9227 | Watts | 382.00 | Spain | Male | 36.00 | 0.00 | 0.00 | 1.00 | 1.00 | 179540.73 | 1.00 |
| 9373 | Loggia | 373.00 | France | Male | 42.00 | 7.00 | 0.00 | 1.00 | 0.00 | 77786.37 | 1.00 |
| 9641 | Maslow | 350.00 | France | Female | 40.00 | 0.00 | 111098.85 | 1.00 | 1.00 | 172321.21 | 1.00 |
(churn['creditscore']<916.5)
0 True
1 True
2 True
3 True
5 True
...
9997 True
10006 True
10025 True
10030 True
10031 True
Name: creditscore, Length: 9973, dtype: bool
(churn['creditscore']>384.5)
0 True
1 True
2 True
3 True
5 True
...
9997 True
10006 True
10025 True
10030 True
10031 True
Name: creditscore, Length: 9973, dtype: bool
churn2=churn[(churn['creditscore']<916.5)&(churn['creditscore']>384.5)]
churn2['creditscore'].plot(kind='box', figsize=(3,4), patch_artist=True)
<AxesSubplot:>
churn2.shape
(9957, 11)
plt.hist(churn2['creditscore'])
plt.title('distribution of total creditscore')
plt.show()
sns.distplot(churn2['creditscore'])
<AxesSubplot:xlabel='creditscore', ylabel='Density'>
churn2=churn
#cross check by visualizing the box plot
plot_boxplot(churn,'creditscore')
plot_boxplot(churn,'age')
#SAVE THE NEW DATA FRAME IN PANDAS
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.shape
(9973, 11)
churn.describe()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|
| count | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 | 9973.00 |
| mean | 650.49 | 38.93 | 5.01 | 76498.54 | 1.53 | 0.52 | 100093.53 | 0.20 |
| std | 96.63 | 10.50 | 2.89 | 62383.80 | 0.58 | 0.50 | 57494.40 | 0.40 |
| min | 350.00 | 18.00 | 0.00 | 0.00 | 1.00 | 0.00 | 11.58 | 0.00 |
| 25% | 584.00 | 32.00 | 3.00 | 0.00 | 1.00 | 0.00 | 51099.56 | 0.00 |
| 50% | 652.00 | 37.00 | 5.00 | 97234.58 | 1.00 | 1.00 | 100187.43 | 0.00 |
| 75% | 717.00 | 44.00 | 7.00 | 127642.44 | 2.00 | 1.00 | 149381.32 | 0.00 |
| max | 850.00 | 92.00 | 10.00 | 250898.09 | 4.00 | 1.00 | 199992.48 | 1.00 |
churn.boxplot()
<AxesSubplot:>
sns.pairplot(churn)
<seaborn.axisgrid.PairGrid at 0x1b564768b20>
churn.corr()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|
| creditscore | 1.00 | -0.00 | 0.00 | 0.01 | 0.01 | 0.03 | -0.00 | -0.03 |
| age | -0.00 | 1.00 | -0.01 | 0.03 | -0.03 | 0.09 | -0.01 | 0.29 |
| tenure | 0.00 | -0.01 | 1.00 | -0.01 | 0.01 | -0.03 | 0.01 | -0.01 |
| balance | 0.01 | 0.03 | -0.01 | 1.00 | -0.30 | -0.01 | 0.01 | 0.12 |
| numofproducts | 0.01 | -0.03 | 0.01 | -0.30 | 1.00 | 0.01 | 0.01 | -0.05 |
| isactivemember | 0.03 | 0.09 | -0.03 | -0.01 | 0.01 | 1.00 | -0.01 | -0.16 |
| estimatedsalary | -0.00 | -0.01 | 0.01 | 0.01 | 0.01 | -0.01 | 1.00 | 0.01 |
| exited | -0.03 | 0.29 | -0.01 | 0.12 | -0.05 | -0.16 | 0.01 | 1.00 |
plt.figure(figsize=(20,20))
sns.heatmap(churn.corr(), vmin=-1, cmap="plasma_r", annot=True)
#same thing can be seen from the correlation as well
<AxesSubplot:>
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.groupby('estimatedsalary').mean()
| creditscore | age | tenure | balance | numofproducts | isactivemember | exited | |
|---|---|---|---|---|---|---|---|
| estimatedsalary | |||||||
| 11.58 | 709.00 | 45.00 | 4.00 | 122917.71 | 1.00 | 1.00 | 1.00 |
| 90.07 | 625.00 | 31.00 | 5.00 | 0.00 | 2.00 | 1.00 | 0.00 |
| 91.75 | 645.00 | 59.00 | 8.00 | 121669.93 | 2.00 | 0.00 | 1.00 |
| 96.27 | 710.00 | 38.00 | 2.00 | 0.00 | 2.00 | 0.00 | 0.00 |
| 106.67 | 629.00 | 40.00 | 9.00 | 0.00 | 1.00 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 199909.32 | 620.00 | 28.00 | 8.00 | 0.00 | 2.00 | 1.00 | 0.00 |
| 199929.17 | 569.00 | 37.00 | 9.00 | 178755.84 | 1.00 | 0.00 | 0.00 |
| 199953.33 | 608.00 | 27.00 | 4.00 | 153325.10 | 1.00 | 1.00 | 0.00 |
| 199970.74 | 639.00 | 41.00 | 5.00 | 98635.77 | 1.00 | 0.00 | 0.00 |
| 199992.48 | 685.00 | 42.00 | 2.00 | 0.00 | 2.00 | 0.00 | 0.00 |
9970 rows × 7 columns
churn.groupby('creditscore').min()
| surname | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|
| creditscore | ||||||||||
| 350.00 | Azubuike | France | Female | 39.00 | 0.00 | 0.00 | 1.00 | 0.00 | 113796.15 | 1.00 |
| 351.00 | Chou | Germany | Female | 57.00 | 4.00 | 163146.46 | 1.00 | 0.00 | 169621.69 | 1.00 |
| 358.00 | Aikenhead | Spain | Female | 52.00 | 8.00 | 143542.36 | 3.00 | 0.00 | 141959.11 | 1.00 |
| 359.00 | Panicucci | France | Female | 44.00 | 6.00 | 128747.69 | 1.00 | 0.00 | 146955.71 | 1.00 |
| 363.00 | Thomas | Spain | Female | 28.00 | 6.00 | 146098.43 | 3.00 | 0.00 | 100615.14 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 846.00 | Baxter | France | Female | 34.00 | 0.00 | 0.00 | 1.00 | 0.00 | 3440.47 | 0.00 |
| 847.00 | Beam | France | Female | 31.00 | 3.00 | 0.00 | 1.00 | 0.00 | 16025.17 | 0.00 |
| 848.00 | Hung | Germany | Female | 22.00 | 3.00 | 90018.45 | 1.00 | 0.00 | 30876.84 | 0.00 |
| 849.00 | Chou | France | Female | 27.00 | 0.00 | 0.00 | 1.00 | 0.00 | 17294.12 | 0.00 |
| 850.00 | Abbott | France | Female | 19.00 | 0.00 | 0.00 | 1.00 | 0.00 | 705.18 | 0.00 |
460 rows × 10 columns
churn[(churn['creditscore']>=600)|(churn['geography']=='france')]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 3 | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 10 | H? | 684.00 | France | Male | 27.00 | 2.00 | 134603.88 | 1.00 | 1.00 | 71725.73 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
| 10006 | Nepean | 659.00 | France | Male | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0.00 |
| 10025 | Obijiaku | 771.00 | France | Male | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0.00 |
| 10030 | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0.00 |
6947 rows × 11 columns
list[churn[(churn['creditscore']<600)|(churn['geography']=='france')]]
list[ surname creditscore geography gender age tenure balance \
2 Onio 502.00 France Female 42.00 8.00 159660.80
8 Obinna 376.00 Germany Female 29.00 4.00 115046.74
9 He 501.00 France Male 44.00 4.00 142051.07
11 Bearce 528.00 France Male 31.00 6.00 102016.72
12 Andrews 497.00 Spain Male 24.00 3.00 0.00
... ... ... ... ... ... ... ...
9981 Douglas 479.00 France Male 34.00 5.00 117593.48
9983 Ch'en 479.00 Spain Male 35.00 4.00 125920.98
9985 McGregor 512.00 Germany Female 40.00 8.00 153537.57
9987 Thompson 518.00 France Male 42.00 7.00 151027.05
9991 Barker 583.00 France Male 33.00 7.00 122531.86
numofproducts isactivemember estimatedsalary exited
2 3.00 0.00 113931.57 1.00
8 4.00 0.00 119346.88 1.00
9 2.00 1.00 74940.50 0.00
11 2.00 0.00 80181.12 0.00
12 2.00 0.00 76390.01 0.00
... ... ... ... ...
9981 2.00 0.00 113308.29 0.00
9983 1.00 1.00 20393.44 0.00
9985 2.00 0.00 23101.13 0.00
9987 2.00 0.00 119377.36 0.00
9991 1.00 0.00 13549.24 0.00
[3026 rows x 11 columns]]
churn['creditscore'].isin(['502.0','518.0'])
0 False
1 False
2 False
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Name: creditscore, Length: 9973, dtype: bool
[churn['creditscore']>=502]
[0 True
1 True
2 True
3 True
5 True
...
9997 True
10006 True
10025 True
10030 True
10031 True
Name: creditscore, Length: 9973, dtype: bool]
[churn['creditscore']<=518]
[0 False
1 False
2 True
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Name: creditscore, Length: 9973, dtype: bool]
sum(churn['creditscore'].between(502,518))
271
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['geography']
0 France
1 Spain
2 France
3 France
5 Spain
...
9997 Spain
10006 France
10025 France
10030 Germany
10031 France
Name: geography, Length: 9973, dtype: object
churn['geography'].str.contains('spain',case=False)
0 False
1 True
2 False
3 False
5 True
...
9997 True
10006 False
10025 False
10030 False
10031 False
Name: geography, Length: 9973, dtype: bool
churn[churn['geography'].str.contains('spain',case=False)]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 12 | Andrews | 497.00 | Spain | Male | 24.00 | 3.00 | 0.00 | 2.00 | 0.00 | 76390.01 | 0.00 |
| 18 | Scott | 635.00 | Spain | Female | 35.00 | 7.00 | 0.00 | 2.00 | 1.00 | 65951.65 | 0.00 |
| 24 | Henderson | 549.00 | Spain | Female | 24.00 | 9.00 | 0.00 | 2.00 | 1.00 | 14406.41 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9976 | Graham | 764.00 | Spain | Male | 38.00 | 4.00 | 113607.47 | 1.00 | 0.00 | 91094.46 | 0.00 |
| 9978 | Aldridge | 573.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 2.00 | 0.00 | 107124.17 | 0.00 |
| 9979 | Flynn | 702.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 1.00 | 0.00 | 59207.41 | 1.00 |
| 9983 | Ch'en | 479.00 | Spain | Male | 35.00 | 4.00 | 125920.98 | 1.00 | 1.00 | 20393.44 | 0.00 |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
2471 rows × 11 columns
len(churn['geography'].str.contains('spain',case=False))
9973
churn['geography'].nunique()
3
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.groupby('geography')['creditscore'].max()
geography France 850.00 Germany 850.00 Spain 850.00 Name: creditscore, dtype: float64
churn.groupby('geography')['creditscore'].mean()
geography France 649.61 Germany 651.55 Spain 651.19 Name: creditscore, dtype: float64
churn.groupby('geography')['creditscore'].mean().sort_values(ascending=False)
geography Germany 651.55 Spain 651.19 France 649.61 Name: creditscore, dtype: float64
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
churn['tenure']
0 2.00
1 1.00
2 8.00
3 1.00
5 8.00
...
9997 6.00
10006 6.00
10025 5.00
10030 3.00
10031 4.00
Name: tenure, Length: 9973, dtype: float64
churn['tenure']==2
0 True
1 False
2 False
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Name: tenure, Length: 9973, dtype: bool
churn[churn['tenure']==2]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 10 | H? | 684.00 | France | Male | 27.00 | 2.00 | 134603.88 | 1.00 | 1.00 | 71725.73 | 0.00 |
| 33 | Young | 756.00 | Germany | Male | 36.00 | 2.00 | 136815.64 | 1.00 | 1.00 | 170041.95 | 0.00 |
| 49 | Osborne | 556.00 | France | Female | 61.00 | 2.00 | 117419.35 | 1.00 | 1.00 | 94153.83 | 0.00 |
| 50 | Lavine | 834.00 | France | Female | 49.00 | 2.00 | 131394.56 | 1.00 | 0.00 | 194365.76 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9922 | Emenike | 645.00 | Germany | Male | 41.00 | 2.00 | 93925.30 | 1.00 | 0.00 | 123982.14 | 1.00 |
| 9926 | Endrizzi | 739.00 | France | Male | 58.00 | 2.00 | 101579.28 | 1.00 | 1.00 | 72168.53 | 0.00 |
| 9960 | Fu | 784.00 | Spain | Male | 23.00 | 2.00 | 0.00 | 1.00 | 1.00 | 6847.73 | 0.00 |
| 9970 | Burke | 550.00 | France | Male | 47.00 | 2.00 | 0.00 | 2.00 | 1.00 | 97057.28 | 0.00 |
| 9980 | Scott | 740.00 | Germany | Male | 33.00 | 2.00 | 126524.11 | 1.00 | 0.00 | 136869.31 | 0.00 |
1045 rows × 11 columns
len(churn[churn['tenure']==2])
1045
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['gender'].value_counts()
Male 5438 Female 4535 Name: gender, dtype: int64
churn['balance'].max()
250898.09
churn[churn['balance'].max()==churn['balance']]['gender']
2099 Male Name: gender, dtype: object
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn_index=churn['estimatedsalary'].sort_values(ascending=False).head().index
churn.iloc[churn_index]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6692 | Christmas | 654.00 | Spain | Female | 36.00 | 5.00 | 0.00 | 2.00 | 0.00 | 157238.05 | 0.00 |
| 3596 | Watts | 446.00 | France | Female | 51.00 | 4.00 | 105056.13 | 1.00 | 0.00 | 70613.52 | 0.00 |
| 4433 | Chao | 707.00 | France | Female | 44.00 | 6.00 | 0.00 | 2.00 | 1.00 | 192542.17 | 0.00 |
| 9142 | Iheanacho | 749.00 | Spain | Male | 34.00 | 2.00 | 0.00 | 1.00 | 0.00 | 174189.04 | 1.00 |
| 8972 | Afanasyeva | 518.00 | Spain | Male | 50.00 | 4.00 | 0.00 | 1.00 | 0.00 | 107112.25 | 1.00 |
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['surname']
0 Hargrave
1 Hill
2 Onio
3 Boni
5 Chu
...
9997 T'ao
10006 Nepean
10025 Obijiaku
10030 Sabbatini
10031 Walker
Name: surname, Length: 9973, dtype: object
churn['surname'].isin(['Watts'])
0 False
1 False
2 False
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Name: surname, Length: 9973, dtype: bool
churn[churn['surname'].isin(['Watts'])]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 529 | Watts | 527.00 | Spain | Male | 37.00 | 5.00 | 93722.73 | 2.00 | 1.00 | 139093.73 | 0.00 |
| 3596 | Watts | 446.00 | France | Female | 51.00 | 4.00 | 105056.13 | 1.00 | 0.00 | 70613.52 | 0.00 |
| 9227 | Watts | 382.00 | Spain | Male | 36.00 | 0.00 | 0.00 | 1.00 | 1.00 | 179540.73 | 1.00 |
| 9635 | Watts | 581.00 | France | Female | 24.00 | 3.00 | 95508.20 | 1.00 | 1.00 | 45755.00 | 0.00 |
| 9826 | Watts | 820.00 | France | Female | 39.00 | 1.00 | 104614.29 | 1.00 | 0.00 | 61538.43 | 1.00 |
# WE CAN ALSO USE STR CONTAINS
churn[churn['surname'].str.contains('Watts')]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 529 | Watts | 527.00 | Spain | Male | 37.00 | 5.00 | 93722.73 | 2.00 | 1.00 | 139093.73 | 0.00 |
| 3596 | Watts | 446.00 | France | Female | 51.00 | 4.00 | 105056.13 | 1.00 | 0.00 | 70613.52 | 0.00 |
| 9227 | Watts | 382.00 | Spain | Male | 36.00 | 0.00 | 0.00 | 1.00 | 1.00 | 179540.73 | 1.00 |
| 9635 | Watts | 581.00 | France | Female | 24.00 | 3.00 | 95508.20 | 1.00 | 1.00 | 45755.00 | 0.00 |
| 9826 | Watts | 820.00 | France | Female | 39.00 | 1.00 | 104614.29 | 1.00 | 0.00 | 61538.43 | 1.00 |
churn['age'].unique
<bound method Series.unique of 0 42.00
1 41.00
2 42.00
3 39.00
5 44.00
...
9997 35.00
10006 36.00
10025 39.00
10030 42.00
10031 28.00
Name: age, Length: 9973, dtype: float64>
churn['age'].sort_values(ascending=False)
6776 92.00
6460 92.00
2465 88.00
3040 85.00
3538 84.00
...
9949 18.00
2143 18.00
2148 18.00
4563 18.00
2028 18.00
Name: age, Length: 9973, dtype: float64
churn['age'].sort_values(ascending=False).head()
6776 92.00 6460 92.00 2465 88.00 3040 85.00 3538 84.00 Name: age, dtype: float64
index=churn['age'].sort_values(ascending=False).head().index
churn.iloc[index]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6805 | Costa | 825.00 | Germany | Male | 37.00 | 6.00 | 118050.79 | 1.00 | 1.00 | 52301.15 | 0.00 |
| 6489 | Johnston | 651.00 | Germany | Female | 34.00 | 10.00 | 148962.46 | 1.00 | 0.00 | 66389.43 | 1.00 |
| 2474 | Yuryeva | 617.00 | France | Female | 27.00 | 4.00 | 0.00 | 2.00 | 0.00 | 190269.21 | 0.00 |
| 3049 | Mahon | 835.00 | Germany | Female | 29.00 | 10.00 | 130420.20 | 2.00 | 0.00 | 106276.55 | 0.00 |
| 3547 | Crawford | 672.00 | France | Male | 43.00 | 5.00 | 0.00 | 1.00 | 0.00 | 63833.09 | 0.00 |
churn['age']
0 42.00
1 41.00
2 42.00
3 39.00
5 44.00
...
9997 35.00
10006 36.00
10025 39.00
10030 42.00
10031 28.00
Name: age, Length: 9973, dtype: float64
churn[churn['age']==50.0]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 142 | Arthur | 589.00 | Germany | Female | 50.00 | 5.00 | 144895.05 | 2.00 | 1.00 | 34941.23 | 0.00 |
| 191 | Cocci | 804.00 | France | Female | 50.00 | 4.00 | 0.00 | 1.00 | 1.00 | 8546.87 | 1.00 |
| 209 | He | 516.00 | Spain | Male | 50.00 | 5.00 | 0.00 | 1.00 | 1.00 | 146145.93 | 1.00 |
| 235 | O'Sullivan | 751.00 | Germany | Male | 50.00 | 2.00 | 96888.39 | 1.00 | 0.00 | 77206.25 | 1.00 |
| 315 | Balashov | 614.00 | France | Male | 50.00 | 4.00 | 137104.47 | 1.00 | 0.00 | 127166.49 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9362 | Miah | 582.00 | France | Male | 50.00 | 2.00 | 148942.00 | 1.00 | 1.00 | 116944.30 | 0.00 |
| 9655 | Baldwin | 744.00 | Germany | Female | 50.00 | 1.00 | 121498.11 | 2.00 | 1.00 | 106061.47 | 1.00 |
| 9899 | Okeke | 771.00 | France | Male | 50.00 | 3.00 | 105229.72 | 1.00 | 1.00 | 16281.68 | 1.00 |
| 9941 | Palerma | 632.00 | Germany | Female | 50.00 | 5.00 | 107959.39 | 1.00 | 1.00 | 6985.34 | 1.00 |
| 9992 | Smith | 610.00 | Germany | Male | 50.00 | 1.00 | 113957.01 | 2.00 | 0.00 | 196526.55 | 1.00 |
133 rows × 11 columns
len(churn[churn['age']==50.0])
133
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['balance']>=83807.86
0 False
1 True
2 True
3 False
5 True
...
9997 False
10006 True
10025 False
10030 False
10031 True
Name: balance, Length: 9973, dtype: bool
churn[churn['balance']>=83807.86]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 8 | Obinna | 376.00 | Germany | Female | 29.00 | 4.00 | 115046.74 | 4.00 | 0.00 | 119346.88 | 1.00 |
| 9 | He | 501.00 | France | Male | 44.00 | 4.00 | 142051.07 | 2.00 | 1.00 | 74940.50 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9993 | Azikiwe | 637.00 | France | Female | 33.00 | 7.00 | 103377.81 | 1.00 | 0.00 | 84419.78 | 0.00 |
| 9995 | P'eng | 774.00 | France | Male | 40.00 | 9.00 | 93017.47 | 2.00 | 0.00 | 191608.97 | 0.00 |
| 9996 | Diribe | 677.00 | France | Female | 58.00 | 1.00 | 90022.85 | 1.00 | 1.00 | 2988.28 | 0.00 |
| 10006 | Nepean | 659.00 | France | Male | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0.00 |
| 10031 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0.00 |
5627 rows × 11 columns
churn['balance'].unique()
array([ 0. , 83807.86, 159660.8 , ..., 123841.49, 75075.31,
130142.79])
(churn['gender']=='Female')
0 True
1 True
2 True
3 True
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 True
Name: gender, Length: 9973, dtype: bool
(churn['geography']=='France')
0 True
1 False
2 True
3 True
5 False
...
9997 False
10006 True
10025 True
10030 False
10031 True
Name: geography, Length: 9973, dtype: bool
(churn['gender']=='Female')&(churn['geography']=='France')
0 True
1 False
2 True
3 True
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 True
Length: 9973, dtype: bool
churn[(churn['gender']=='Female')&(churn['geography']=='France')]['balance']
0 0.00
2 159660.80
3 0.00
13 0.00
14 0.00
...
9988 144751.81
9993 103377.81
9994 0.00
9996 90022.85
10031 130142.79
Name: balance, Length: 2257, dtype: float64
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['gender'].unique()
array(['Female', 'Male'], dtype=object)
(churn['gender']=='Male')
0 False
1 False
2 False
3 False
5 True
...
9997 True
10006 True
10025 True
10030 True
10031 False
Name: gender, Length: 9973, dtype: bool
(churn['geography']=='Spain')
0 False
1 True
2 False
3 False
5 True
...
9997 True
10006 False
10025 False
10030 False
10031 False
Name: geography, Length: 9973, dtype: bool
(churn['gender']=='Male')&(churn['geography']=='Spain')
0 False
1 False
2 False
3 False
5 True
...
9997 True
10006 False
10025 False
10030 False
10031 False
Length: 9973, dtype: bool
churn[(churn['gender']=='Male')&(churn['geography']=='Spain')]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 12 | Andrews | 497.00 | Spain | Male | 24.00 | 3.00 | 0.00 | 2.00 | 0.00 | 76390.01 | 0.00 |
| 25 | Muldrow | 587.00 | Spain | Male | 45.00 | 6.00 | 0.00 | 1.00 | 0.00 | 158684.81 | 0.00 |
| 43 | Watson | 490.00 | Spain | Male | 31.00 | 3.00 | 145260.23 | 1.00 | 1.00 | 114066.77 | 0.00 |
| 44 | Lorenzo | 804.00 | Spain | Male | 33.00 | 7.00 | 76548.60 | 1.00 | 1.00 | 98453.45 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9976 | Graham | 764.00 | Spain | Male | 38.00 | 4.00 | 113607.47 | 1.00 | 0.00 | 91094.46 | 0.00 |
| 9978 | Aldridge | 573.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 2.00 | 0.00 | 107124.17 | 0.00 |
| 9979 | Flynn | 702.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 1.00 | 0.00 | 59207.41 | 1.00 |
| 9983 | Ch'en | 479.00 | Spain | Male | 35.00 | 4.00 | 125920.98 | 1.00 | 1.00 | 20393.44 | 0.00 |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
1385 rows × 11 columns
churn[(churn['gender']=='Male')&(churn['geography']=='Spain')|(churn['balance']=='159660.8')]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 12 | Andrews | 497.00 | Spain | Male | 24.00 | 3.00 | 0.00 | 2.00 | 0.00 | 76390.01 | 0.00 |
| 25 | Muldrow | 587.00 | Spain | Male | 45.00 | 6.00 | 0.00 | 1.00 | 0.00 | 158684.81 | 0.00 |
| 43 | Watson | 490.00 | Spain | Male | 31.00 | 3.00 | 145260.23 | 1.00 | 1.00 | 114066.77 | 0.00 |
| 44 | Lorenzo | 804.00 | Spain | Male | 33.00 | 7.00 | 76548.60 | 1.00 | 1.00 | 98453.45 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9976 | Graham | 764.00 | Spain | Male | 38.00 | 4.00 | 113607.47 | 1.00 | 0.00 | 91094.46 | 0.00 |
| 9978 | Aldridge | 573.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 2.00 | 0.00 | 107124.17 | 0.00 |
| 9979 | Flynn | 702.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 1.00 | 0.00 | 59207.41 | 1.00 |
| 9983 | Ch'en | 479.00 | Spain | Male | 35.00 | 4.00 | 125920.98 | 1.00 | 1.00 | 20393.44 | 0.00 |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
1385 rows × 11 columns
churn[(churn['gender']=='Male')&(churn['geography']=='Spain')|(churn['surname']=='Lorenzo')]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| 12 | Andrews | 497.00 | Spain | Male | 24.00 | 3.00 | 0.00 | 2.00 | 0.00 | 76390.01 | 0.00 |
| 25 | Muldrow | 587.00 | Spain | Male | 45.00 | 6.00 | 0.00 | 1.00 | 0.00 | 158684.81 | 0.00 |
| 43 | Watson | 490.00 | Spain | Male | 31.00 | 3.00 | 145260.23 | 1.00 | 1.00 | 114066.77 | 0.00 |
| 44 | Lorenzo | 804.00 | Spain | Male | 33.00 | 7.00 | 76548.60 | 1.00 | 1.00 | 98453.45 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9976 | Graham | 764.00 | Spain | Male | 38.00 | 4.00 | 113607.47 | 1.00 | 0.00 | 91094.46 | 0.00 |
| 9978 | Aldridge | 573.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 2.00 | 0.00 | 107124.17 | 0.00 |
| 9979 | Flynn | 702.00 | Spain | Male | 44.00 | 9.00 | 0.00 | 1.00 | 0.00 | 59207.41 | 1.00 |
| 9983 | Ch'en | 479.00 | Spain | Male | 35.00 | 4.00 | 125920.98 | 1.00 | 1.00 | 20393.44 | 0.00 |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
1395 rows × 11 columns
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.numofproducts.value_counts()
1.00 5069 2.00 4578 3.00 266 4.00 60 Name: numofproducts, dtype: int64
churn.numofproducts.unique()
array([1., 3., 2., 4.])
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
churn.groupby('gender')['balance'].mean().sort_values(ascending=False)
gender Male 77170.09 Female 75693.28 Name: balance, dtype: float64
sns.barplot(x='gender',y='balance',data=churn)
plt.title('gender by balance')
plt.show()
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
churn_top10=churn.nlargest(10,('balance'))[['geography']]
churn_top10
| geography | |
|---|---|
| 2099 | Spain |
| 3287 | France |
| 8750 | Spain |
| 3595 | Spain |
| 6734 | Spain |
| 1074 | Germany |
| 145 | Spain |
| 1540 | France |
| 3927 | Germany |
| 2716 | France |
churn_top10=churn.nlargest(10,('balance'))[['geography']].set_index('geography')
churn_top10
| geography |
|---|
| Spain |
| France |
| Spain |
| Spain |
| Spain |
| Germany |
| Spain |
| France |
| Germany |
| France |
churn_top10=churn.nsmallest(10,('balance'))[['geography']]
churn_top10
| geography | |
|---|---|
| 0 | France |
| 3 | France |
| 12 | Spain |
| 13 | France |
| 14 | France |
| 18 | Spain |
| 24 | Spain |
| 25 | Spain |
| 26 | France |
| 27 | France |
churn_top10=churn.nlargest(10,('balance'))[['geography','gender']]
churn_top10
| geography | gender | |
|---|---|---|
| 2099 | Spain | Male |
| 3287 | France | Female |
| 8750 | Spain | Male |
| 3595 | Spain | Male |
| 6734 | Spain | Female |
| 1074 | Germany | Male |
| 145 | Spain | Female |
| 1540 | France | Female |
| 3927 | Germany | Male |
| 2716 | France | Male |
churn['estimatedsalary'].value_counts()
62825.03 3
24924.92 2
101348.88 1
182692.80 1
38466.39 1
..
2465.80 1
47799.15 1
165562.84 1
72927.68 1
38190.78 1
Name: estimatedsalary, Length: 9970, dtype: int64
sns.countplot(x='estimatedsalary',data=churn)
plt.title('no of estimatedsalary')
plt.show()
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
churn['creditscore'].max()
850.0
churn['creditscore']
0 619.00
1 608.00
2 502.00
3 699.00
5 645.00
...
9997 741.00
10006 659.00
10025 771.00
10030 772.00
10031 792.00
Name: creditscore, Length: 9973, dtype: float64
[churn['creditscore'].max()==churn['creditscore']]
[0 False
1 False
2 False
3 False
5 False
...
9997 False
10006 False
10025 False
10030 False
10031 False
Name: creditscore, Length: 9973, dtype: bool]
churn[churn['creditscore'].max()==churn['creditscore']]
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 45 | Armstrong | 850.00 | France | Male | 36.00 | 7.00 | 0.00 | 1.00 | 1.00 | 40812.90 | 0.00 |
| 187 | Rozier | 850.00 | Spain | Female | 45.00 | 2.00 | 122311.21 | 1.00 | 1.00 | 19482.50 | 0.00 |
| 207 | Chiemezie | 850.00 | Spain | Male | 30.00 | 2.00 | 141040.01 | 1.00 | 1.00 | 5978.20 | 0.00 |
| 230 | Stevenson | 850.00 | France | Male | 33.00 | 10.00 | 0.00 | 1.00 | 0.00 | 4861.72 | 1.00 |
| 266 | Welch | 850.00 | Germany | Male | 38.00 | 3.00 | 54901.01 | 1.00 | 1.00 | 140075.55 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9598 | Glover | 850.00 | France | Female | 28.00 | 9.00 | 0.00 | 2.00 | 1.00 | 164864.67 | 0.00 |
| 9663 | Muir | 850.00 | Spain | Male | 71.00 | 10.00 | 69608.14 | 1.00 | 0.00 | 97893.40 | 1.00 |
| 9705 | Ward | 850.00 | France | Male | 68.00 | 5.00 | 169445.40 | 1.00 | 1.00 | 186335.07 | 0.00 |
| 9948 | Greco | 850.00 | France | Female | 34.00 | 6.00 | 101266.51 | 1.00 | 0.00 | 33501.98 | 0.00 |
| 9966 | Wan | 850.00 | France | Female | 47.00 | 9.00 | 137301.87 | 1.00 | 0.00 | 44351.77 | 0.00 |
231 rows × 11 columns
churn[churn['creditscore'].max()==churn['creditscore']]['geography']
45 France
187 Spain
207 Spain
230 France
266 Germany
...
9598 France
9663 Spain
9705 France
9948 France
9966 France
Name: geography, Length: 231, dtype: object
churn.columns
Index(['surname', 'creditscore', 'geography', 'gender', 'age', 'tenure',
'balance', 'numofproducts', 'isactivemember', 'estimatedsalary',
'exited'],
dtype='object')
sns.scatterplot(x='estimatedsalary',y='balance',data=churn)
<AxesSubplot:xlabel='estimatedsalary', ylabel='balance'>
churn
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
| 10006 | Nepean | 659.00 | France | Male | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0.00 |
| 10025 | Obijiaku | 771.00 | France | Male | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0.00 |
| 10030 | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0.00 |
9973 rows × 11 columns
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn.geography.unique()
array(['France', 'Spain', 'Germany'], dtype=object)
sns.set(style='whitegrid')
plt.figure(figsize=(14, 7))
labels=['France', 'Spain','Germany'],
plt.pie(churn['geography'].value_counts(),labels=['France', 'Spain','Germany'],explode=[0.1,0.1,0.1],
autopct='%1.2f%%',colors=['#E37383','#FFC0CB'], startangle=90)
plt.title('geography')
plt.axis('equal')
plt.show()
plt.figure(figsize=(8,5))
sns.countplot('gender', data = churn, color='#00ddff', saturation=0.9)
<AxesSubplot:xlabel='gender', ylabel='count'>
plt.figure(figsize=(20,20))
sns.heatmap(churn.corr(), vmin=-1, cmap="plasma_r", annot=True)
#same thing can be seen from the correlation as well
<AxesSubplot:>
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
sns.scatterplot(x='age',y='exited',data=churn)
<AxesSubplot:xlabel='age', ylabel='exited'>
sns.scatterplot(x='age',y='exited',data=churn,hue='gender',palette='RdYlBu')
<AxesSubplot:xlabel='age', ylabel='exited'>
sns.barplot(x='gender',y='exited',data=churn,palette='RdYlBu')
<AxesSubplot:xlabel='gender', ylabel='exited'>
sns.histplot(churn['tenure'],kde=True,bins=15)
<AxesSubplot:xlabel='tenure', ylabel='Count'>
sns.boxplot(x='geography',y='exited',data=churn,hue='gender',palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='exited'>
sns.histplot(churn['estimatedsalary'],kde=True,bins=15)
<AxesSubplot:xlabel='estimatedsalary', ylabel='Count'>
sns.barplot(x='gender',y='estimatedsalary',data=churn,palette='RdYlBu')
<AxesSubplot:xlabel='gender', ylabel='estimatedsalary'>
sns.boxplot(x='geography',y='estimatedsalary',data=churn,hue='gender',palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='estimatedsalary'>
sns.histplot(churn['balance'],kde=True,bins=15)
<AxesSubplot:xlabel='balance', ylabel='Count'>
sns.barplot(x='gender',y='balance',data=churn,palette='RdYlBu')
<AxesSubplot:xlabel='gender', ylabel='balance'>
sns.boxplot(x='geography',y='balance',data=churn,hue='gender',palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='balance'>
sns.histplot(churn['creditscore'],kde=True,bins=15)
<AxesSubplot:xlabel='creditscore', ylabel='Count'>
sns.barplot(x='gender',y='creditscore',data=churn,palette='RdYlBu')
<AxesSubplot:xlabel='gender', ylabel='creditscore'>
sns.boxplot(x='geography',y='creditscore',data=churn,hue='gender',palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='creditscore'>
sns.stripplot(x='geography',y='creditscore',data=churn,hue='gender',dodge=True ,palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='creditscore'>
sns.stripplot(x='geography',y='exited',data=churn,hue='gender',dodge=True ,palette='YlGnBu')
<AxesSubplot:xlabel='geography', ylabel='exited'>
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
sns.jointplot(x='exited',y='balance',data=churn,kind='reg')
<seaborn.axisgrid.JointGrid at 0x1b50039c3a0>
sns.jointplot(x='exited',y='estimatedsalary',data=churn)
<seaborn.axisgrid.JointGrid at 0x1b50016b490>
sns.jointplot(x='creditscore',y='estimatedsalary',data=churn,)
<seaborn.axisgrid.JointGrid at 0x1b57ff08a00>
sns.jointplot(x='creditscore',y='estimatedsalary',data=churn,kind='kde')
<seaborn.axisgrid.JointGrid at 0x1b57ed06a00>
sns.jointplot(x='creditscore',y='estimatedsalary',data=churn,kind='kde',shade=True)
<seaborn.axisgrid.JointGrid at 0x1b57eb51310>
sns.jointplot(x='creditscore',y='estimatedsalary',data=churn,kind='kde',shade=True,cmap='YlGnBu')
<seaborn.axisgrid.JointGrid at 0x1b57e98f940>
sns.jointplot(x='creditscore',y='estimatedsalary',data=churn,kind='hex',cmap='YlGnBu')
<seaborn.axisgrid.JointGrid at 0x1b57e7cdaf0>
sns.countplot(x='exited',hue='gender',data=churn)
<AxesSubplot:xlabel='exited', ylabel='count'>
sns.countplot(x='exited',hue='geography',data=churn)
<AxesSubplot:xlabel='exited', ylabel='count'>
churn['age'].plot.hist()
<AxesSubplot:ylabel='Frequency'>
churn['balance'].plot.hist(bins=20,figsize=(10,5))
<AxesSubplot:ylabel='Frequency'>
churn['creditscore'].plot.hist(bins=20,figsize=(10,5))
<AxesSubplot:ylabel='Frequency'>
churn['estimatedsalary'].plot.hist(bins=20,figsize=(10,5))
<AxesSubplot:ylabel='Frequency'>
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn['exited'].unique
<bound method Series.unique of 0 1.00
1 0.00
2 1.00
3 0.00
5 1.00
...
9997 0.00
10006 0.00
10025 0.00
10030 1.00
10031 0.00
Name: exited, Length: 9973, dtype: float64>
#visualization of this data
cols=['creditscore','geography','gender','age','tenure','balance','numofproducts','isactivemember','estimatedsalary','exited']
n_rows=2
n_cols=3
#the subplot grid and figsize of the graph
fig, axs =plt.subplots(n_rows,n_cols,figsize=(n_cols*3.2,n_rows*3.2))
for r in range(0,n_rows):
for c in range (0,n_cols):
i = r*n_cols+c #index to go through the number of columns
ax= axs[r][c] #show were to position each subplot
sns.countplot(churn[cols[i]],hue=churn['exited'],ax=ax)
ax.set_title(cols[i])
ax.legend(title='exited',loc='upper right')
plt.tight_layout
churn.pivot_table('exited',index='gender',columns='geography').plot()
<AxesSubplot:xlabel='gender'>
f, ax =plt.subplots(1,2,figsize= (18,8))
churn['exited'].value_counts().plot.pie(explode=[0,0.1,],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('exited')
ax[0].set_ylabel('')
sns.countplot('exited',data=churn,ax=ax[1])
ax[1].set_title('exited')
plt.show()
churn.head(2)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
churn_gender = churn.groupby(['gender'])[['creditscore', 'balance', 'estimatedsalary']].mean()
colors_4 = ['magenta','yellow','green','red']
colors_3 =['magenta','yellow','green']
churn_gender.plot.pie(subplots=True, figsize=(20,10), labels=churn_gender.index, autopct='%1.1f%%',colors=colors_4)
plt.show()
churn_geography = churn.groupby(['geography'])[['creditscore', 'balance', 'estimatedsalary']].mean()
churn_geography.plot.pie(subplots=True, figsize=(20,10), labels=churn_geography.index, autopct='%1.1f%%',colors=colors_4)
plt.show()
sns.factorplot('geography','exited',hue='gender',data=churn)
<seaborn.axisgrid.FacetGrid at 0x1b51c537a30>
f, ax = plt.subplots(1,2,figsize=(18,8,))
sns.violinplot('geography','age', hue='exited',data=churn,ax=ax[0])
ax[0].set_title('geography and age vs exited')
ax[0].set_yticks(range(0,110,10))
sns.violinplot('gender','age',hue='exited',data=churn,ax=ax[1])
ax[1].set_title('gender and age vs exited')
ax[1].set_yticks(range(0,110,10))
plt.show()
f, ax=plt.subplots(1,2,figsize=(12,10))
churn[churn['exited']==0].age.plot.hist(ax=ax[0],bins=20,edgecolor='black',color='red')
ax[0].set_title('exited=0')
x1=list(range(0,85,5))
ax[0].set_xticks(x1)
churn[churn['exited']==1].age.plot.hist(ax=ax[1],color='green',bins=20,edgecolor='black')
ax[1].set_title('exited=1')
x2=list(range(0,85,5))
ax[1].set_xticks(x2)
plt.show()
plt.figure(figsize=(30,30))
sns.set_context("talk",font_scale=1)
sns.set_palette("pastel")
ax = sns.countplot(y="age", hue="gender", data=churn)
ax.legend(loc='upper right',frameon=True)
plt.title('GENDER vs AGE', fontsize=18, fontweight='bold')
ax.set(xlabel='COUNT OF GENDER',ylabel='AGE')
plt.show()
plt.figure(figsize=(6,6))
sns.set_context("talk",font_scale=1)
sns.set_palette("pastel")
ax = sns.countplot(y="gender", hue="exited", data=churn)
ax.legend(loc='upper right',frameon=True)
plt.title('exited vs gender', fontsize=8, fontweight='bold')
ax.set(xlabel='COUNT OF exited ',ylabel='gender')
plt.show()
plt.figure(figsize=(8,8))
plt.title('Correlation Analysis',color='Red',fontsize=20,pad=40)
corr = churn.corr()
mask = np.triu(np.ones_like(corr,dtype = bool))
sns.heatmap(churn.corr(),mask=mask,annot=True,linewidths=.5);
plt.xticks(rotation=60)
plt.yticks(rotation = 60)
plt.show()
colors = ['#E94B3C','#2D2926']
exited = churn[churn['exited'] == 1].describe().T
not_exited = churn[churn['exited'] == 0].describe().T
fig,ax = plt.subplots(nrows = 1,ncols = 2,figsize = (5,5))
plt.subplot(1,2,1)
sns.heatmap(exited[['mean']],annot = True,cmap = colors,linewidths = 0.4,linecolor = 'black',cbar = False,fmt = '.2f')
plt.title('exited Customers');
plt.subplot(1,2,2)
sns.heatmap(not_exited[['mean']],annot = True,cmap = colors,linewidths = 0.4,linecolor = 'black',cbar = False,fmt = '.2f',)
plt.title('Not_exited Customers');
fig.tight_layout(pad = 0)
l = list(churn['exited'].value_counts())
circle = [l[0] / sum(l) * 100,l[1] / sum(l) * 100]
fig = plt.subplots(nrows = 1,ncols = 2,figsize = (20,5))
plt.subplot(1,2,1)
plt.pie(circle,labels = ['Not-exited Customer','exited Customer'],autopct = '%1.1f%%',startangle = 90,explode = (0.1,0),colors = colors,
wedgeprops = {'edgecolor' : 'black','linewidth': 1,'antialiased' : True})
plt.title('exited - Not-exited %');
plt.subplot(1,2,2)
ax = sns.countplot('exited',data = churn,palette = colors,edgecolor = 'black')
for rect in ax.patches:
ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 2, rect.get_height(), horizontalalignment='center', fontsize = 11)
plt.title('Number of Exited - Not-Exited Customers');
plt.show()
churn.head(5)
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
churn.head()
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
churn=churn.drop(churn.columns[[0]],axis=1)
churn.dtypes
creditscore float64 geography object gender object age float64 tenure float64 balance float64 numofproducts float64 isactivemember float64 estimatedsalary float64 exited float64 dtype: object
churn_catg
| surname | geography | gender | |
|---|---|---|---|
| 0 | Hargrave | France | Female |
| 1 | Hill | Spain | Female |
| 2 | Onio | France | Female |
| 3 | Boni | France | Female |
| 4 | NaN | NaN | NaN |
| ... | ... | ... | ... |
| 10027 | Johnstone | France | Male |
| 10028 | Liu | France | Female |
| 10029 | Liu | France | Female |
| 10030 | Sabbatini | Germany | Male |
| 10031 | Walker | France | Female |
10032 rows × 3 columns
churn_cont.head(2)
| rownumber | customerid | creditscore | age | tenure | balance | numofproducts | hascrcard | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.00 | 15634602.00 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 2.00 | NaN | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 0.00 | 1.00 | 112542.58 | 0.00 |
churn1
| surname | creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hargrave | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | Hill | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | Onio | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | Boni | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | Chu | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | T'ao | 741.00 | Spain | Male | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 0.00 |
| 10006 | Nepean | 659.00 | France | Male | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0.00 |
| 10025 | Obijiaku | 771.00 | France | Male | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0.00 |
| 10030 | Sabbatini | 772.00 | Germany | Male | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1.00 |
| 10031 | Walker | 792.00 | France | Female | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0.00 |
9913 rows × 11 columns
churn.head()
| creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
#geography1={'geography':{'France':0,'Spain':1,'Germany':2}}
#churn=churn.replace(geography1)
churn.head()
| creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 |
| 1 | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 |
| 2 | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 |
| 3 | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 |
| 5 | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 |
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
churn['geography1']=le.fit_transform(churn.geography)
churn.head()
| creditscore | geography | gender | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | France | Female | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 |
| 1 | 608.00 | Spain | Female | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 |
| 2 | 502.00 | France | Female | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 | 0 |
| 3 | 699.00 | France | Female | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 | 0 |
| 5 | 645.00 | Spain | Male | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 | 2 |
churn=pd.get_dummies(churn,drop_first=True)
churn.head()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 | 2 | 0 | 1 | 1 |
#churn=pd.get_dummies(churn,columns=['gender'],prefix='gender',drop_first=True)
churn.head(2)
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 | 0 | 1 | 0 |
churn['exited'].value_counts()
0.00 7943 1.00 2030 Name: exited, dtype: int64
sns.countplot(churn['exited'])
<AxesSubplot:xlabel='exited', ylabel='count'>
X= churn.drop('exited',axis=1)
y=churn['exited']
X
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 2 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | 741.00 | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 2 | 0 | 1 | 1 |
| 10006 | 659.00 | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0 | 0 | 0 | 1 |
| 10025 | 771.00 | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0 | 0 | 0 | 1 |
| 10030 | 772.00 | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1 | 1 | 0 | 1 |
| 10031 | 792.00 | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0 | 0 | 0 | 0 |
9973 rows × 11 columns
y
0 1.00
1 0.00
2 1.00
3 0.00
5 1.00
...
9997 0.00
10006 0.00
10025 0.00
10030 1.00
10031 0.00
Name: exited, Length: 9973, dtype: float64
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42,stratify=y)
X_train
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5800 | 624.00 | 71.00 | 7.00 | 0.00 | 2.00 | 1.00 | 108841.83 | 0 | 0 | 0 | 1 |
| 5401 | 727.00 | 28.00 | 1.00 | 0.00 | 1.00 | 0.00 | 40357.39 | 2 | 0 | 1 | 1 |
| 2369 | 709.00 | 45.00 | 4.00 | 122917.71 | 1.00 | 1.00 | 11.58 | 1 | 1 | 0 | 1 |
| 7532 | 623.00 | 35.00 | 0.00 | 130557.24 | 1.00 | 1.00 | 47880.71 | 0 | 0 | 0 | 0 |
| 3495 | 587.00 | 35.00 | 3.00 | 83286.56 | 1.00 | 0.00 | 125553.52 | 2 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9692 | 735.00 | 39.00 | 1.00 | 60374.98 | 1.00 | 0.00 | 40223.74 | 2 | 0 | 1 | 1 |
| 4820 | 517.00 | 39.00 | 3.00 | 0.00 | 2.00 | 1.00 | 12465.51 | 2 | 0 | 1 | 1 |
| 3082 | 556.00 | 35.00 | 10.00 | 0.00 | 2.00 | 1.00 | 192751.18 | 0 | 0 | 0 | 1 |
| 2558 | 537.00 | 53.00 | 3.00 | 0.00 | 1.00 | 1.00 | 91406.62 | 0 | 0 | 0 | 0 |
| 9301 | 633.00 | 61.00 | 3.00 | 157201.48 | 1.00 | 1.00 | 50368.63 | 0 | 0 | 0 | 1 |
7978 rows × 11 columns
y_train
5800 0.00
5401 0.00
2369 1.00
7532 0.00
3495 0.00
...
9692 0.00
4820 0.00
3082 0.00
2558 0.00
9301 0.00
Name: exited, Length: 7978, dtype: float64
churn.head()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 | 2 | 0 | 1 | 1 |
X
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 2 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | 741.00 | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 2 | 0 | 1 | 1 |
| 10006 | 659.00 | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0 | 0 | 0 | 1 |
| 10025 | 771.00 | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0 | 0 | 0 | 1 |
| 10030 | 772.00 | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1 | 1 | 0 | 1 |
| 10031 | 792.00 | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0 | 0 | 0 | 0 |
9973 rows × 11 columns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)
X_train
array([[-0.26906411, 3.04660636, 0.69093269, ..., -0.57918359,
-0.57628887, 0.9139413 ],
[ 0.79139897, -1.03510395, -1.38121605, ..., -0.57918359,
1.73524086, 0.9139413 ],
[ 0.60607532, 0.57859547, -0.34514168, ..., 1.72656825,
-0.57628887, 0.9139413 ],
...,
[-0.96917566, -0.37063949, 1.72700706, ..., -0.57918359,
-0.57628887, 0.9139413 ],
[-1.16479506, 1.33798344, -0.6904998 , ..., -0.57918359,
-0.57628887, -1.09416217],
[-0.17640229, 2.0973714 , -0.6904998 , ..., -0.57918359,
-0.57628887, 0.9139413 ]])
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(X_train,y_train)
LogisticRegression()
y_pred1=log.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)
0.8095238095238095
from sklearn.metrics import precision_score,recall_score,f1_score
precision_score(y_test,y_pred1)
0.5955882352941176
recall_score(y_test,y_pred1)
0.19950738916256158
f1_score(y_test,y_pred1)
0.2988929889298893
normal=churn[churn['exited']==0]
fraud=churn[churn['exited']==1]
normal.shape
(7943, 12)
fraud.shape
(2030, 12)
normal_sample=normal.sample(n=2055)
normal_sample.shape
(2055, 12)
new_churn=pd.concat([normal_sample,fraud])
new_churn['exited'].value_counts()
0.00 2055 1.00 2030 Name: exited, dtype: int64
new_churn.head()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3921 | 658.00 | 44.00 | 2.00 | 168396.34 | 1.00 | 1.00 | 14178.73 | 0.00 | 0 | 0 | 0 | 1 |
| 8311 | 704.00 | 36.00 | 2.00 | 175509.80 | 2.00 | 0.00 | 152039.67 | 0.00 | 2 | 0 | 1 | 0 |
| 5438 | 589.00 | 39.00 | 7.00 | 0.00 | 2.00 | 0.00 | 95985.64 | 0.00 | 0 | 0 | 0 | 0 |
| 3044 | 693.00 | 21.00 | 1.00 | 0.00 | 2.00 | 1.00 | 3494.02 | 0.00 | 0 | 0 | 0 | 1 |
| 8989 | 650.00 | 32.00 | 4.00 | 79450.09 | 1.00 | 1.00 | 118324.75 | 0.00 | 2 | 0 | 1 | 0 |
X=new_churn.drop('exited',axis=1)
y=new_churn['exited']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
log=LogisticRegression()
log.fit(X_train,y_train)
LogisticRegression()
y_pred1=log.predict(X_test)
accuracy_score(y_test,y_pred1)
0.594859241126071
from sklearn.metrics import precision_score,recall_score,f1_score
precision_score(y_test,y_pred1)
0.5618448637316562
recall_score(y_test,y_pred1)
0.6871794871794872
f1_score(y_test,y_pred1)
0.6182237600922722
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
DecisionTreeClassifier()
y_pred2=dt.predict(X_test)
accuracy_score(y_test,y_pred2)
0.7025703794369645
f1_score(y_test,y_pred2)
0.6864516129032259
precision_score(y_test,y_pred2)
0.6909090909090909
recall_score(y_test,y_pred2)
0.6820512820512821
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
RandomForestClassifier()
y_pred3=rf.predict(X_test)
accuracy_score(y_test,y_pred3)
0.7870257037943696
f1_score(y_test,y_pred3)
0.7740259740259741
precision_score(y_test,y_pred3)
0.7842105263157895
recall_score(y_test,y_pred3)
0.764102564102564
final_data=pd.DataFrame({'Models':['log','dt','rf'],
'ACC':[accuracy_score(y_test,y_pred1)*100,
accuracy_score(y_test,y_pred2)*100,
accuracy_score(y_test,y_pred3)*100]})
final_data
| Models | ACC | |
|---|---|---|
| 0 | log | 59.49 |
| 1 | dt | 70.26 |
| 2 | rf | 78.70 |
sns.barplot(final_data['Models'],final_data['ACC'])
<AxesSubplot:xlabel='Models', ylabel='ACC'>
churn.head()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 | 2 | 0 | 1 | 1 |
churn.shape
(9973, 12)
churn['exited'].value_counts()
0.00 7943 1.00 2030 Name: exited, dtype: int64
sns.countplot(churn['exited'])
<AxesSubplot:xlabel='exited', ylabel='count'>
X= churn.drop('exited',axis=1)
X
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 2 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9997 | 741.00 | 35.00 | 6.00 | 74371.49 | 1.00 | 0.00 | 99595.67 | 2 | 0 | 1 | 1 |
| 10006 | 659.00 | 36.00 | 6.00 | 123841.49 | 2.00 | 0.00 | 96833.00 | 0 | 0 | 0 | 1 |
| 10025 | 771.00 | 39.00 | 5.00 | 0.00 | 2.00 | 0.00 | 96270.64 | 0 | 0 | 0 | 1 |
| 10030 | 772.00 | 42.00 | 3.00 | 75075.31 | 2.00 | 0.00 | 92888.52 | 1 | 1 | 0 | 1 |
| 10031 | 792.00 | 28.00 | 4.00 | 130142.79 | 1.00 | 0.00 | 38190.78 | 0 | 0 | 0 | 0 |
9973 rows × 11 columns
y=churn['exited']
y
0 1.00
1 0.00
2 1.00
3 0.00
5 1.00
...
9997 0.00
10006 0.00
10025 0.00
10030 1.00
10031 0.00
Name: exited, Length: 9973, dtype: float64
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42,stratify=y)
X_train
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5800 | 624.00 | 71.00 | 7.00 | 0.00 | 2.00 | 1.00 | 108841.83 | 0 | 0 | 0 | 1 |
| 5401 | 727.00 | 28.00 | 1.00 | 0.00 | 1.00 | 0.00 | 40357.39 | 2 | 0 | 1 | 1 |
| 2369 | 709.00 | 45.00 | 4.00 | 122917.71 | 1.00 | 1.00 | 11.58 | 1 | 1 | 0 | 1 |
| 7532 | 623.00 | 35.00 | 0.00 | 130557.24 | 1.00 | 1.00 | 47880.71 | 0 | 0 | 0 | 0 |
| 3495 | 587.00 | 35.00 | 3.00 | 83286.56 | 1.00 | 0.00 | 125553.52 | 2 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9692 | 735.00 | 39.00 | 1.00 | 60374.98 | 1.00 | 0.00 | 40223.74 | 2 | 0 | 1 | 1 |
| 4820 | 517.00 | 39.00 | 3.00 | 0.00 | 2.00 | 1.00 | 12465.51 | 2 | 0 | 1 | 1 |
| 3082 | 556.00 | 35.00 | 10.00 | 0.00 | 2.00 | 1.00 | 192751.18 | 0 | 0 | 0 | 1 |
| 2558 | 537.00 | 53.00 | 3.00 | 0.00 | 1.00 | 1.00 | 91406.62 | 0 | 0 | 0 | 0 |
| 9301 | 633.00 | 61.00 | 3.00 | 157201.48 | 1.00 | 1.00 | 50368.63 | 0 | 0 | 0 | 1 |
7978 rows × 11 columns
y_train
5800 0.00
5401 0.00
2369 1.00
7532 0.00
3495 0.00
...
9692 0.00
4820 0.00
3082 0.00
2558 0.00
9301 0.00
Name: exited, Length: 7978, dtype: float64
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
X_train=minmax.fit_transform(X_train)
X_test=minmax.fit_transform(X_test)
X_train
array([[0.548 , 0.71621622, 0.7 , ..., 0. , 0. ,
1. ],
[0.754 , 0.13513514, 0.1 , ..., 0. , 1. ,
1. ],
[0.718 , 0.36486486, 0.4 , ..., 1. , 0. ,
1. ],
...,
[0.412 , 0.22972973, 1. , ..., 0. , 0. ,
1. ],
[0.374 , 0.47297297, 0.3 , ..., 0. , 0. ,
0. ],
[0.566 , 0.58108108, 0.3 , ..., 0. , 0. ,
1. ]])
from imblearn.over_sampling import SMOTE
SMOTE().fit_resample(X,y)
( creditscore age tenure balance numofproducts isactivemember \
0 619.00 42.00 2.00 0.00 1.00 1.00
1 608.00 41.00 1.00 83807.86 1.00 1.00
2 502.00 42.00 8.00 159660.80 3.00 0.00
3 699.00 39.00 1.00 0.00 2.00 0.00
4 645.00 44.00 8.00 113755.78 2.00 0.00
... ... ... ... ... ... ...
15881 757.07 60.50 7.98 125438.21 1.05 0.00
15882 579.83 49.40 2.35 100032.41 1.00 0.41
15883 601.86 43.62 2.57 0.00 1.57 0.57
15884 571.98 39.63 4.00 0.00 1.00 0.00
15885 607.70 45.60 4.02 0.00 1.49 0.00
estimatedsalary geography1 geography_Germany geography_Spain \
0 101348.88 0 0 0
1 112542.58 2 0 1
2 113931.57 0 0 0
3 93826.63 0 0 0
4 149756.71 2 0 1
... ... ... ... ...
15881 182223.25 0 0 0
15882 5928.03 1 0 0
15883 174137.68 0 0 0
15884 95434.59 0 0 0
15885 92236.98 1 0 0
gender_Male
0 0
1 0
2 0
3 0
4 1
... ...
15881 0
15882 1
15883 0
15884 0
15885 0
[15886 rows x 11 columns],
0 1.00
1 0.00
2 1.00
3 0.00
4 1.00
...
15881 1.00
15882 1.00
15883 1.00
15884 1.00
15885 1.00
Name: exited, Length: 15886, dtype: float64)
X_res,y_res=SMOTE().fit_resample(X,y)
y_res.value_counts()
1.00 7943 0.00 7943 Name: exited, dtype: int64
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,test_size=0.20,random_state=42,)
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(X_train,y_train)
LogisticRegression()
y_pred1=log.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)
0.6784140969162996
from sklearn.metrics import precision_score,recall_score,f1_score
precision_score(y_test,y_pred1)
0.6680722891566265
recall_score(y_test,y_pred1)
0.7018987341772152
f1_score(y_test,y_pred1)
0.6845679012345679
from sklearn import svm
svm=svm.SVC()
svm.fit(X_train,y_train)
SVC()
y_pred2=svm.predict(X_test)
accuracy_score(y_test,y_pred2)
0.5670232850849591
precision_score(y_test,y_pred2)
0.5473098330241187
f1_score(y_test,y_pred2)
0.6316916488222698
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(X_train,y_train)
KNeighborsClassifier()
y_pred3=knn.predict(X_test)
accuracy_score(y_test,y_pred3)
0.6768407803650094
precision_score(y_test,y_pred3)
0.6469962785752259
recall_score(y_test,y_pred3)
0.770253164556962
f1_score(y_test,y_pred3)
0.7032649523259173
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
DecisionTreeClassifier()
y_pred4=dt.predict(X_test)
accuracy_score(y_test,y_pred4)
0.8442416614222782
precision_score(y_test,y_pred4)
0.84098051539912
recall_score(y_test,y_pred4)
0.8468354430379746
f1_score(y_test,y_pred4)
0.8438978240302742
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
RandomForestClassifier()
y_pred5=rf.predict(X_test)
accuracy_score(y_test,y_pred5)
0.89490245437382
precision_score(y_test,y_pred5)
0.9192462987886945
recall_score(y_test,y_pred5)
0.8645569620253165
f1_score(y_test,y_pred5)
0.8910632746249184
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb.fit(X_train,y_train)
GradientBoostingClassifier()
y_pred6=gb.predict(X_test)
accuracy_score(y_test,y_pred6)
0.8926998112020138
precision_score(y_test,y_pred6)
0.9166106254203094
recall_score(y_test,y_pred6)
0.8626582278481013
f1_score(y_test,y_pred6)
0.8888164329964134
final_data1=pd.DataFrame({'Models':['log','svc','knn','dt','rf','gb'],
'ACC':[accuracy_score(y_test,y_pred1)*100,
accuracy_score(y_test,y_pred2)*100,
accuracy_score(y_test,y_pred3)*100,
accuracy_score(y_test,y_pred4)*100,
accuracy_score(y_test,y_pred5)*100,
accuracy_score(y_test,y_pred6)*100]})
final_data1
| Models | ACC | |
|---|---|---|
| 0 | log | 67.84 |
| 1 | svc | 56.70 |
| 2 | knn | 67.68 |
| 3 | dt | 84.42 |
| 4 | rf | 89.49 |
| 5 | gb | 89.27 |
sns.barplot(final_data1['Models'],final_data1['ACC'])
<AxesSubplot:xlabel='Models', ylabel='ACC'>
final_data2=pd.DataFrame({'Models':['log','svc','knn','dt','rf','gb'],
'PRE':[precision_score(y_test,y_pred1)*100,
precision_score(y_test,y_pred2)*100,
precision_score(y_test,y_pred3)*100,
precision_score(y_test,y_pred4)*100,
precision_score(y_test,y_pred5)*100,
precision_score(y_test,y_pred6)*100]})
final_data2
| Models | PRE | |
|---|---|---|
| 0 | log | 66.81 |
| 1 | svc | 54.73 |
| 2 | knn | 64.70 |
| 3 | dt | 84.10 |
| 4 | rf | 91.92 |
| 5 | gb | 91.66 |
sns.barplot(final_data2['Models'],final_data2['PRE'])
<AxesSubplot:xlabel='Models', ylabel='PRE'>
final_data3=pd.DataFrame({'Models':['log','svc','knn','dt','rf','gb'],
'REC':[recall_score(y_test,y_pred1)*100,
recall_score(y_test,y_pred2)*100,
recall_score(y_test,y_pred3)*100,
recall_score(y_test,y_pred4)*100,
recall_score(y_test,y_pred5)*100,
recall_score(y_test,y_pred6)*100]})
final_data3
| Models | REC | |
|---|---|---|
| 0 | log | 70.19 |
| 1 | svc | 74.68 |
| 2 | knn | 77.03 |
| 3 | dt | 84.68 |
| 4 | rf | 86.46 |
| 5 | gb | 86.27 |
sns.barplot(final_data3['Models'],final_data3['REC'])
<AxesSubplot:xlabel='Models', ylabel='REC'>
final_data4=pd.DataFrame({'Models':['log','svc','knn','dt','rf','gb'],
'F1':[f1_score(y_test,y_pred1)*100,
f1_score(y_test,y_pred2)*100,
f1_score(y_test,y_pred3)*100,
f1_score(y_test,y_pred4)*100,
f1_score(y_test,y_pred5)*100,
f1_score(y_test,y_pred6)*100]})
final_data4
| Models | F1 | |
|---|---|---|
| 0 | log | 68.46 |
| 1 | svc | 63.17 |
| 2 | knn | 70.33 |
| 3 | dt | 84.39 |
| 4 | rf | 89.11 |
| 5 | gb | 88.88 |
sns.barplot(final_data4['Models'],final_data4['F1'])
<AxesSubplot:xlabel='Models', ylabel='F1'>
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
X_res=minmax.fit_transform(X_res)
rf.fit(X_res,y_res)
RandomForestClassifier()
import joblib
joblib.dump(rf,'churn_predict_model')
['churn_predict_model']
model=joblib.load('churn_predict_model')
churn.columns
Index(['creditscore', 'age', 'tenure', 'balance', 'numofproducts',
'isactivemember', 'estimatedsalary', 'exited', 'geography1',
'geography_Germany', 'geography_Spain', 'gender_Male'],
dtype='object')
churn.head()
| creditscore | age | tenure | balance | numofproducts | isactivemember | estimatedsalary | exited | geography1 | geography_Germany | geography_Spain | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619.00 | 42.00 | 2.00 | 0.00 | 1.00 | 1.00 | 101348.88 | 1.00 | 0 | 0 | 0 | 0 |
| 1 | 608.00 | 41.00 | 1.00 | 83807.86 | 1.00 | 1.00 | 112542.58 | 0.00 | 2 | 0 | 1 | 0 |
| 2 | 502.00 | 42.00 | 8.00 | 159660.80 | 3.00 | 0.00 | 113931.57 | 1.00 | 0 | 0 | 0 | 0 |
| 3 | 699.00 | 39.00 | 1.00 | 0.00 | 2.00 | 0.00 | 93826.63 | 0.00 | 0 | 0 | 0 | 0 |
| 5 | 645.00 | 44.00 | 8.00 | 113755.78 | 2.00 | 0.00 | 149756.71 | 1.00 | 2 | 0 | 1 | 1 |
model.predict([[619,0,42,2,0.00,1.00,1.00,101348.88,1.00,0,0]])
array([0.])
from tkinter import *
import joblib
from tkinter import messagebox
master=Tk()
master.mainloop()
#ADD TITLE TO IT
master=Tk()
master.title('churn_predict_model')
master.mainloop()
#LETS SET DIMENSION OF THIS WINDOW
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.mainloop()
#LETS ADD COLOR TO THE WINDOW
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.configure(bg='lightblue')
master.mainloop()
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.configure(bg='lightblue')
label=Label(master,text='Bank churn',font=('Arial',20,'bold'),width=10,height=1,bg='lightblue',foreground='yellow').grid(row=0,columnspan=2)
Label(master,text='creditscore').grid(row=1)
Label(master,text='gender[0-1]').grid(row=2)
Label(master,text='age').grid(row=3)
Label(master,text='tenure').grid(row=4)
Label(master,text='balance').grid(row=5)
Label(master,text=' numofproducts').grid(row=6)
Label(master,text='isactivemember').grid(row=7)
Label(master,text='estimatedsalary').grid(row=8)
Label(master,text='geography1').grid(row=9)
Label(master,text='geography_Germany').grid(row=10)
Label(master,text='geography_Spain').grid(row=11)
master.mainloop()
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.configure(bg='lightblue')
label=Label(master,text='Bank churn',font=('Arial',20,'bold'),width=10,height=1,bg='lightblue',foreground='yellow').grid(row=0,columnspan=2)
Label(master,text='creditscore').grid(row=1)
Label(master,text='gender[0-1]').grid(row=2)
Label(master,text='age').grid(row=3)
Label(master,text='tenure').grid(row=4)
Label(master,text='balance').grid(row=5)
Label(master,text=' numofproducts').grid(row=6)
Label(master,text='isactivemember').grid(row=7)
Label(master,text='estimatedsalary').grid(row=8)
Label(master,text='geography1').grid(row=9)
Label(master,text='geography_Germany').grid(row=10)
Label(master,text='geography_Spain').grid(row=11)
e1=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e2=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e3=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e4=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e5=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e6=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e7=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e8=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e9=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e10=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
e11=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3,show='*')
master.mainloop()
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.configure(bg='lightblue')
label=Label(master,text='Bank churn',font=('Arial',20,'bold'),width=10,height=1,bg='lightblue',foreground='yellow').grid(row=0,columnspan=2)
Label(master,text='creditscore').grid(row=1)
Label(master,text='gender[0-1]').grid(row=2)
Label(master,text='age').grid(row=3)
Label(master,text='tenure').grid(row=4)
Label(master,text='balance').grid(row=5)
Label(master,text=' numofproducts').grid(row=6)
Label(master,text='isactivemember').grid(row=7)
Label(master,text='estimatedsalary').grid(row=8)
Label(master,text='geography1').grid(row=9)
Label(master,text='geography_Germany').grid(row=10)
Label(master,text='geography_Spain').grid(row=11)
e1=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e2=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e3=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e4=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e5=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e6=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e7=Entry( master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e8=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e9=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e10=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e11=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)
master.mainloop()
def show_entry():
p1=float(e1.get())
p2=float(e2.get())
p3=float(e3.get())
p4=float(e4.get())
p5=float(e5.get())
p6=float(e6.get())
p7=float(e6.get())
p8=float(e6.get())
p9=float(e6.get())
p10=float(e6.get())
p11=float(e6.get())
model=joblib.load('churn_predict_model')
result=model.predict([[p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11]])
Label(master, text='Bank_churn').grid(row=12)
Label(master,text=result).grid(row=13)
master=Tk()
master.title('churn_predict_model')
master.geometry('400x300')
master.minsize(200,200)
master.maxsize(600,600)
master.configure(bg='lightblue')
label=Label(master,text='Bank_churn',font=('Arial',20,'bold'),width=10,height=1,bg='lightblue',foreground='yellow').grid(row=0,columnspan=2)
Label(master,text='creditscore').grid(row=1)
Label(master,text='gender[0-1]').grid(row=2)
Label(master,text='age').grid(row=3)
Label(master,text='tenure').grid(row=4)
Label(master,text='balance').grid(row=5)
Label(master,text=' numofproducts').grid(row=6)
Label(master,text='isactivemember').grid(row=7)
Label(master,text='estimatedsalary').grid(row=8)
Label(master,text='geography1').grid(row=9)
Label(master,text='geography_Germany').grid(row=10)
Label(master,text='geography_Spain').grid(row=11)
e1=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e2=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e3=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e4=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e5=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e6=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e7=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e8=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e9=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e10=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e11=Entry(master,font=('Arial',14),bg='grey',fg='white',borderwidth=3)
e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)
Button(master,text='predict',command=show_entry,bg='pink',font=('Arial',20,'bold'),borderwidth=3,activebackground='blue').grid()
master.mainloop()